author    | Eric Cheng <none@none> | 2008-12-04 18:16:10 -0800
committer | Eric Cheng <none@none> | 2008-12-04 18:16:10 -0800
commit    | da14cebe459d3275048785f25bd869cb09b5307f (patch)
tree      | a394d2c61ec4d7591782a4a5db4e3a157c3ca89a /usr/src
parent    | 03361682bf38acf5bcc36ee83a0d6277731eee68 (diff)
download  | illumos-joyent-da14cebe459d3275048785f25bd869cb09b5307f.tar.gz
PSARC/2006/357 Crossbow - Network Virtualization and Resource Management
6498311 Crossbow - Network Virtualization and Resource Management
6402493 DLPI provider loopback behavior should be improved
6453165 move mac capabs definitions outside mac.h
6338667 Need ability to use NAT for non-global zones
6692884 several threads hung due to deadlock scenario between aggr and mac
6768302 dls: soft_ring_bind/unbind race can panic in thread_affinity_set with cpu_id == -1
6635849 race between lacp_xmit_sm() and aggr_m_stop() ends in panic
6742712 potential message double free in the aggr driver
6754299 a potential race between aggr_m_tx() and aggr_port_delete()
6485324 mi_data_lock recursively held when enabling promiscuous mode on an aggregation
6442559 Forwarding perf bottleneck due to mac_rx() calls
6505462 assertion failure after removing a port from a snooped aggregation
6716664 need to add src/dst IP address to soft ring fanout
--HG--
rename : usr/src/uts/common/io/dls/dls_soft_ring.c => usr/src/uts/common/io/mac/mac_soft_ring.c
rename : usr/src/uts/common/inet/ip/ip_cksum.c => usr/src/uts/common/os/ip_cksum.c
rename : usr/src/uts/common/inet/sctp_crc32.c => usr/src/uts/common/os/sctp_crc32.c
rename : usr/src/uts/common/sys/dls_soft_ring.h => usr/src/uts/common/sys/mac_soft_ring.h
Diffstat (limited to 'usr/src')
326 files changed, 55600 insertions, 23414 deletions
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 927f5ca801..5d7c5f0f8c 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -161,6 +161,7 @@ COMMON_SUBDIRS= \
 	file \
 	filebench \
 	find \
+	flowadm \
 	fm \
 	fmli \
 	fmt \
@@ -582,6 +583,7 @@ MSGSUBDIRS= \
 	file \
 	filesync \
 	find \
+	flowadm \
 	fm \
 	fold \
 	fs.d \
diff --git a/usr/src/cmd/Makefile.cmd b/usr/src/cmd/Makefile.cmd
index 44364753b2..8abf748eab 100644
--- a/usr/src/cmd/Makefile.cmd
+++ b/usr/src/cmd/Makefile.cmd
@@ -66,6 +66,7 @@ ROOTETCTSOL= $(ROOTETCSECURITY)/tsol
 ROOTETCSECLIB= $(ROOTETCSECURITY)/lib
 ROOTETCZONES= $(ROOTETC)/zones
+ROOTETCINET= $(ROOT)/etc/inet
 ROOTCCSBIN= $(ROOT)/usr/ccs/bin
 ROOTCCSBIN64= $(ROOTCCSBIN)/$(MACH64)
 ROOTCCSBINLINKDIR= $(ROOT)/../../bin
@@ -316,6 +317,9 @@ $(ROOTUSRSBIN64)/%: %
 $(ROOTETC)/%: %
 	$(INS.file)
 
+$(ROOTETCINET)/%: %
+	$(INS.file)
+
 $(ROOTETCDEFAULT)/%: %.dfl
 	$(INS.rename)
diff --git a/usr/src/cmd/acctadm/Makefile b/usr/src/cmd/acctadm/Makefile
index 554135fa78..09343cbca7 100644
--- a/usr/src/cmd/acctadm/Makefile
+++ b/usr/src/cmd/acctadm/Makefile
@@ -19,8 +19,6 @@
 # CDDL HEADER END
 #
 #
-#ident "%Z%%M% %I% %E% SMI"
-#
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
@@ -37,7 +35,7 @@ include ../Makefile.cmd
 ROOTMANIFESTDIR = $(ROOTSVCSYSTEM)
 
 CFLAGS += $(CCVERBOSE)
-LDLIBS += -lexacct -lscf -lsecdb
+LDLIBS += -lexacct -lscf -lsecdb -ldladm
 POFILE = acctadm.po
 XGETFLAGS = -a -x acctadm.xcl
 FILEMODE = 0555
diff --git a/usr/src/cmd/acctadm/acctadm.xcl b/usr/src/cmd/acctadm/acctadm.xcl
index 4926a94690..e8d2b4572d 100644
--- a/usr/src/cmd/acctadm/acctadm.xcl
+++ b/usr/src/cmd/acctadm/acctadm.xcl
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -19,6 +18,11 @@
 #
 # CDDL HEADER END
 #
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
 msgid "ruxf:e:d:"
 msgid "/etc/acctadm.conf"
 msgid ""
@@ -26,6 +30,7 @@ msgid "process"
 msgid "proc"
 msgid "task"
 msgid "flow"
+msgid "net"
 msgid "no"
 msgid "none"
 msgid "yes"
@@ -41,6 +46,10 @@ msgid "ACCTADM_FLOW_ENABLE"
 msgid "ACCTADM_FLOW_FILE"
 msgid "ACCTADM_FLOW_TRACKED"
 msgid "ACCTADM_FLOW_UNTRACKED"
+msgid "ACCTADM_NET_ENABLE"
+msgid "ACCTADM_NET_FILE"
+msgid "ACCTADM_NET_TRACKED"
+msgid "ACCTADM_NET_UNTRACKED"
 msgid "r+"
 msgid "r"
 msgid " %[^=]=%s \n%n"
diff --git a/usr/src/cmd/acctadm/aconf.c b/usr/src/cmd/acctadm/aconf.c
index 70c5f7618d..8453a4fa8f 100644
--- a/usr/src/cmd/acctadm/aconf.c
+++ b/usr/src/cmd/acctadm/aconf.c
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <sys/types.h>
 #include <sys/acctctl.h>
 #include <unistd.h>
@@ -32,6 +30,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <limits.h>
+#include <libdllink.h>
 #include <libscf.h>
 #include <pwd.h>
 #include <auth_attr.h>
@@ -47,6 +46,7 @@
 #define FMRI_FLOW_ACCT "svc:/system/extended-accounting:flow"
 #define FMRI_PROC_ACCT "svc:/system/extended-accounting:process"
 #define FMRI_TASK_ACCT "svc:/system/extended-accounting:task"
+#define FMRI_NET_ACCT "svc:/system/extended-accounting:net"
 
 #define NELEM(x) (sizeof (x)) / (sizeof (x[0]))
 
@@ -134,13 +134,14 @@ aconf_setup(const char *fmri)
 	}
 
 	/*
-	 * Flow accounting is not available in non-global zones and
+	 * Net/Flow accounting is not available in non-global zones and
 	 * the service instance should therefore never be 'enabled' in
 	 * non-global zones. This is enforced by acctadm(1M), but there is
 	 * nothing that prevents someone from calling svcadm enable directly,
 	 * so we handle that case here by disabling the instance.
 	 */
-	if (type == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+	if ((type == AC_FLOW || type == AC_NET) &&
+	    getzoneid() != GLOBAL_ZONEID) {
 		(void) smf_disable_instance(fmri, 0);
 		warn(gettext("%s accounting cannot be configured in "
 		    "non-global zones\n"), ac_type_name(type));
@@ -210,6 +211,19 @@ aconf_setup(const char *fmri)
 		ret = SMF_EXIT_ERR_FATAL;
 	}
 	(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
+
+	if (state == AC_ON && type == AC_NET) {
+		/*
+		 * Start logging.
+		 */
+		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG,
+		    NULL);
+		(void) dladm_start_usagelog(strncmp(tracked, "basic",
+		    strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK :
+		    DLADM_LOGTYPE_FLOW, 20);
+		(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG,
+		    NULL);
+	}
 out:
 	aconf_scf_fini();
 	return (ret);
@@ -219,7 +233,7 @@ void
 aconf_print(FILE *fp, int types)
 {
 	acctconf_t ac;
-	int print_order[] = { AC_TASK, AC_PROC, AC_FLOW };
+	int print_order[] = { AC_TASK, AC_PROC, AC_FLOW, AC_NET };
 	int i;
 
 	for (i = 0; i < NELEM(print_order); i++) {
@@ -279,6 +293,21 @@ aconf_print_type(acctconf_t *acp, FILE *fp, int type)
 		    gettext(" Untracked flow resources: %s\n"),
 		    acp->untracked);
 		break;
+	case AC_NET:
+		(void) fprintf(fp,
+		    gettext(" Net accounting: %s\n"),
+		    acp->state == AC_ON ?
+		    gettext("active") : gettext("inactive"));
+		(void) fprintf(fp,
+		    gettext(" Net accounting file: %s\n"),
+		    acp->file);
+		(void) fprintf(fp,
+		    gettext(" Tracked net resources: %s\n"),
+		    acp->tracked);
+		(void) fprintf(fp,
+		    gettext(" Untracked net resources: %s\n"),
+		    acp->untracked);
+		break;
 	}
 }
 
@@ -369,6 +398,8 @@ aconf_type2fmri(int type)
 		return (FMRI_TASK_ACCT);
 	case AC_FLOW:
 		return (FMRI_FLOW_ACCT);
+	case AC_NET:
+		return (FMRI_NET_ACCT);
 	default:
 		die(gettext("invalid type %d\n"), type);
 	}
@@ -385,6 +416,8 @@ aconf_fmri2type(const char *fmri)
 		return (AC_TASK);
 	else if (strcmp(fmri, FMRI_FLOW_ACCT) == 0)
 		return (AC_FLOW);
+	else if (strcmp(fmri, FMRI_NET_ACCT) == 0)
+		return (AC_NET);
 	else
 		return (-1);
 }
diff --git a/usr/src/cmd/acctadm/extended-accounting.xml b/usr/src/cmd/acctadm/extended-accounting.xml
index 2c68130080..07cb9af9c1 100644
--- a/usr/src/cmd/acctadm/extended-accounting.xml
+++ b/usr/src/cmd/acctadm/extended-accounting.xml
@@ -23,8 +23,6 @@
 	CDDL HEADER END
 
-	ident "%Z%%M% %I% %E% SMI"
-
 	NOTE:  This service manifest is not editable; its contents will
 	be overwritten by package or patch operations, including
 	operating system upgrade.
 	Make customizations in a different
@@ -175,6 +173,43 @@
 	</documentation>
 	</template>
 	</instance>
+
+	<instance name='net' enabled='false'>
+
+		<property_group name='general' type='framework'>
+			<propval name='action_authorization' type='astring'
+				value='solaris.smf.manage.extended-accounting.net' />
+			<propval name='value_authorization' type='astring'
+				value='solaris.smf.manage.extended-accounting.net' />
+		</property_group>
+
+		<property_group name='config' type='application'>
+			<propval name='value_authorization' type='astring'
+				value='solaris.smf.value.extended-accounting.net' />
+			<propval name='enabled' type='boolean'
+				value='false' />
+			<propval name='file' type='astring'
+				value='none' />
+			<propval name='tracked' type='astring'
+				value='none' />
+			<propval name='untracked' type='astring'
+				value='extended' />
+		</property_group>
+
+		<template>
+			<common_name>
+				<loctext xml:lang='C'>
+				configure net extended accounting
+				</loctext>
+			</common_name>
+
+			<documentation>
+				<manpage
+					title='acctadm' section='1M'
+					manpath='/usr/share/man' />
+			</documentation>
+		</template>
+	</instance>
 
 	<stability value='Unstable' />
 
 </service>
diff --git a/usr/src/cmd/acctadm/main.c b/usr/src/cmd/acctadm/main.c
index f83c1ec73c..484caf8988 100644
--- a/usr/src/cmd/acctadm/main.c
+++ b/usr/src/cmd/acctadm/main.c
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <sys/acctctl.h>
 #include <assert.h>
 #include <stdio.h>
@@ -33,6 +31,7 @@
 #include <string.h>
 #include <errno.h>
 #include <libintl.h>
+#include <libdllink.h>
 #include <locale.h>
 #include <priv.h>
 #include <libscf.h>
@@ -44,12 +43,12 @@
 
 static const char USAGE[] = "\
 Usage:\n\
-	acctadm [ {process | task | flow} ]\n\
+	acctadm [ {process | task | flow | net} ]\n\
 	acctadm -s\n\
-	acctadm -r [ {process | task | flow} ]\n\
-	acctadm -x|-E|-D {process | task | flow}\n\
-	acctadm -f filename {process | task | flow}\n\
-	acctadm -e resources -d resources {process | task | flow}\n";
+	acctadm -r [ {process | task | flow | net} ]\n\
+	acctadm -x|-E|-D {process | task | flow | net}\n\
+	acctadm -f filename {process | task | flow | net}\n\
+	acctadm -e resources -d resources {process | task | flow | net}\n";
 
 static const char OPTS[] = "rsxf:e:d:ED";
 
@@ -77,6 +76,7 @@ setup_privs()
 
 	(void) priv_addset(privset, PRIV_SYS_ACCT);
 	(void) priv_addset(privset, PRIV_FILE_DAC_WRITE);
+	(void) priv_addset(privset, PRIV_SYS_DL_CONFIG);
 	(void) priv_delset(privset, PRIV_FILE_LINK_ANY);
 	(void) priv_delset(privset, PRIV_PROC_EXEC);
 	(void) priv_delset(privset, PRIV_PROC_FORK);
@@ -98,10 +98,11 @@ setup_privs()
 		die(gettext("cannot setup privileges"));
 
 	/*
-	 * Turn off the sys_acct and file_dac_write privileges until needed.
+	 * Turn off the sys_acct, file_dac_write and dl_config privileges
+	 * until needed.
 	 */
 	(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_FILE_DAC_WRITE,
-	    PRIV_SYS_ACCT, NULL);
+	    PRIV_SYS_ACCT, PRIV_SYS_DL_CONFIG, NULL);
 }
 
 int
@@ -183,7 +184,7 @@ main(int argc, char *argv[])
 
 	if (!(disabled || enabled || Dflg || Eflg || file || sflg || xflg))
 		(void) priv_set(PRIV_OFF, PRIV_PERMITTED,
-		    PRIV_SYS_ACCT, NULL);
+		    PRIV_SYS_ACCT, PRIV_SYS_DL_CONFIG, NULL);
 
 	if (optind < argc) {
 		if (typestr != NULL) {
@@ -203,20 +204,34 @@ main(int argc, char *argv[])
 			type |= AC_TASK;
 		else if (strcmp(typestr, "flow") == 0)
 			type |= AC_FLOW;
+		else if (strcmp(typestr, "net") == 0)
+			type |= AC_NET;
 		else {
 			warn(gettext("unknown accounting type -- %s\n"),
 			    typestr);
 			usage();
 		}
 	} else
-		type = AC_PROC | AC_TASK | AC_FLOW;
+		type = AC_PROC | AC_TASK | AC_FLOW | AC_NET;
 
 	/*
+	 * Drop the DL config privilege if we are not working with
+	 * net.
+	 */
+	if ((type & AC_NET) == 0) {
+		(void) priv_set(PRIV_OFF, PRIV_PERMITTED,
+		    PRIV_SYS_DL_CONFIG, NULL);
+	}
+
+	/*
 	 * check for invalid options
 	 */
 	if (optcnt > 1)
 		usage();
 
+	/*
+	 * XXX For AC_NET, enabled/disabled should only be "basic" or
+	 * "extended" - need to check it here.
+	 */
 	if ((enabled || disabled) &&
 	    (rflg || Dflg || sflg || xflg || Eflg))
 		usage();
@@ -253,9 +268,10 @@ main(int argc, char *argv[])
 			return (E_ERROR);
 	}
 
-	assert(type == AC_PROC || type == AC_TASK || type == AC_FLOW);
+	assert(type == AC_PROC || type == AC_TASK || type == AC_FLOW ||
+	    type == AC_NET);
 
-	if (type == AC_FLOW && getzoneid() != GLOBAL_ZONEID)
+	if ((type == AC_FLOW || type == AC_NET) && getzoneid() != GLOBAL_ZONEID)
 		die(gettext("%s accounting cannot be configured in "
 		    "non-global zones\n"), ac_type_name(type));
 
@@ -277,6 +293,18 @@ main(int argc, char *argv[])
 		/*
 		 * Turn off the specified accounting and close its file
 		 */
+
+		/*
+		 * Stop net logging before turning it off so that the last
+		 * set of logs can be written.
+		 */
+		if (type & AC_NET) {
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_stop_usagelog(DLADM_LOGTYPE_FLOW);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 		state = AC_OFF;
 
 		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -311,8 +339,22 @@ main(int argc, char *argv[])
 			free(buf);
 			die(gettext("cannot obtain list of resources\n"));
 		}
-		if (disabled)
+		if (disabled) {
+			/*
+			 * Stop net logging before turning it off so that the
+			 * last set of logs can be written.
+			 */
+			if (type & AC_NET) {
+				(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+				    PRIV_SYS_DL_CONFIG, NULL);
+				(void) dladm_stop_usagelog(strncmp(disabled,
+				    "basic", strlen("basic")) == 0 ?
+				    DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW);
+				(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+				    PRIV_SYS_DL_CONFIG, NULL);
+			}
 			str2buf(buf, disabled, AC_OFF, type);
+		}
 		if (enabled)
 			str2buf(buf, enabled, AC_ON, type);
 
@@ -332,6 +374,24 @@ main(int argc, char *argv[])
 		if (aconf_set_string(AC_PROP_UNTRACKED, untracked) == -1)
 			die(gettext("cannot update %s property\n"),
 			    AC_PROP_UNTRACKED);
+		/*
+		 * We will enable net logging after turning it on so that
+		 * it can immediately start writing log.
+		 */
+		if (type & AC_NET && enabled != NULL) {
+			/*
+			 * Default logging interval for AC_NET is 20.
+			 * XXX need to find the right place to
+			 * configure it.
+			 */
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_start_usagelog(strncmp(enabled, "basic",
+			    strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK :
+			    DLADM_LOGTYPE_FLOW, 20);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 		free(tracked);
 		free(untracked);
 		free(buf);
@@ -365,6 +425,18 @@ main(int argc, char *argv[])
 		/*
 		 * Disable accounting
 		 */
+
+		/*
+		 * Stop net logging before turning it off so that the last
+		 * set of logs can be written.
+		 */
+		if (type & AC_NET) {
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_stop_usagelog(DLADM_LOGTYPE_FLOW);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 		state = AC_OFF;
 
 		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -395,6 +467,17 @@ main(int argc, char *argv[])
 			die(gettext("cannot update %s property\n"),
 			    AC_PROP_STATE);
 		modified++;
+		if (type & AC_NET) {
+			/*
+			 * Default logging interval for AC_NET is 20,
+			 * XXX need to find the right place to configure it.
+			 */
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_start_usagelog(DLADM_LOGTYPE_FLOW, 20);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 	}
 	(void) priv_set(PRIV_OFF, PRIV_PERMITTED, PRIV_SYS_ACCT, NULL);
diff --git a/usr/src/cmd/acctadm/res.c b/usr/src/cmd/acctadm/res.c
index 844e3641c1..7f9484f12b 100644
--- a/usr/src/cmd/acctadm/res.c
+++ b/usr/src/cmd/acctadm/res.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <libintl.h>
@@ -89,6 +87,33 @@ static ac_resname_t ac_names[] = {
 	{ AC_FLOW, AC_FLOW_ANAME, "action" },
 
 	/*
+	 * Net accounting resources
+	 */
+
+	{ AC_NET, AC_NET_NAME, "name" },
+	{ AC_NET, AC_NET_EHOST, "ehost" },
+	{ AC_NET, AC_NET_EDEST, "edest" },
+	{ AC_NET, AC_NET_VLAN_TPID, "vlan_pid" },
+	{ AC_NET, AC_NET_VLAN_TCI, "vlan_tci" },
+	{ AC_NET, AC_NET_SAP, "sap" },
+	{ AC_NET, AC_NET_PRIORITY, "priority" },
+	{ AC_NET, AC_NET_BWLIMIT, "bwlimit" },
+	{ AC_NET, AC_NET_DEVNAME, "devname" },
+	{ AC_NET, AC_NET_SADDR, "src_ip" },
+	{ AC_NET, AC_NET_DADDR, "dst_ip" },
+	{ AC_NET, AC_NET_SPORT, "src_port" },
+	{ AC_NET, AC_NET_DPORT, "dst_port" },
+	{ AC_NET, AC_NET_PROTOCOL, "protocol" },
+	{ AC_NET, AC_NET_DSFIELD, "dsfield" },
+	{ AC_NET, AC_NET_CURTIME, "curtime" },
+	{ AC_NET, AC_NET_IBYTES, "ibytes" },
+	{ AC_NET, AC_NET_OBYTES, "obytes" },
+	{ AC_NET, AC_NET_IPKTS, "ipkts" },
+	{ AC_NET, AC_NET_OPKTS, "opkts" },
+	{ AC_NET, AC_NET_IERRPKTS, "ierrpkts" },
+	{ AC_NET, AC_NET_OERRPKTS, "oerrpkts" },
+
+	/*
 	 * These are included for compatibility with old acctadm that
 	 * didn't have resource groups for individual accounting types.
 	 * It was possible to have resource "pid" enabled for task
@@ -134,6 +159,19 @@ static ac_group_t ac_groups[] = {
 	{ AC_FLOW_SADDR, AC_FLOW_DADDR, AC_FLOW_SPORT, AC_FLOW_DPORT,
 	AC_FLOW_PROTOCOL, AC_FLOW_NBYTES, AC_FLOW_NPKTS, AC_FLOW_ANAME,
 	AC_NONE } },
+	{ AC_NET, "extended",
+	{ AC_NET_NAME, AC_NET_EHOST, AC_NET_EDEST, AC_NET_VLAN_TPID,
+	AC_NET_VLAN_TCI, AC_NET_SAP, AC_NET_PRIORITY,
+	AC_NET_BWLIMIT, AC_NET_DEVNAME, AC_NET_SADDR, AC_NET_DADDR,
+	AC_NET_SPORT, AC_NET_DPORT, AC_NET_PROTOCOL, AC_NET_DSFIELD,
+	AC_NET_CURTIME, AC_NET_IBYTES, AC_NET_OBYTES, AC_NET_IPKTS,
+	AC_NET_OPKTS, AC_NET_IERRPKTS, AC_NET_OERRPKTS, AC_NONE } },
+	{ AC_NET, "basic",
+	{ AC_NET_NAME, AC_NET_DEVNAME, AC_NET_EHOST, AC_NET_EDEST,
+	AC_NET_VLAN_TPID, AC_NET_VLAN_TCI, AC_NET_SAP,
+	AC_NET_PRIORITY, AC_NET_BWLIMIT, AC_NET_CURTIME, AC_NET_IBYTES,
+	AC_NET_OBYTES, AC_NET_IPKTS, AC_NET_OPKTS, AC_NET_IERRPKTS,
+	AC_NET_OERRPKTS, AC_NONE } },
 	{ AC_NONE, NULL, { AC_NONE } }
 };
 
@@ -202,9 +240,10 @@ printgroups(int type)
 {
 	int header = 0;
 
-	if ((type & AC_PROC) && (type & AC_TASK) && (type & AC_FLOW))
+	if ((type & AC_PROC) && (type & AC_TASK) && (type & AC_FLOW) &&
+	    (type & AC_NET)) {
 		header = 1;
-
+	}
 	if (type & AC_PROC) {
 		if (header == 1)
 			(void) printf("process:\n");
@@ -220,6 +259,11 @@ printgroups(int type)
 		(void) printf("flow:\n");
 		printgroup(AC_FLOW);
 	}
+	if (type & AC_NET) {
+		if (header == 1)
+			(void) printf("net:\n");
+		printgroup(AC_NET);
+	}
 }
 
 /*
diff --git a/usr/src/cmd/acctadm/utils.c b/usr/src/cmd/acctadm/utils.c
index 26482d5ccd..bbee653eeb 100644
--- a/usr/src/cmd/acctadm/utils.c
+++ b/usr/src/cmd/acctadm/utils.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <assert.h>
 #include <sys/types.h>
 #include <sys/acctctl.h>
@@ -107,6 +105,8 @@ ac_type_name(int type)
 		return (gettext("flow"));
 	case AC_TASK:
 		return (gettext("task"));
+	case AC_NET:
+		return (gettext("net"));
 	default:
 		die(gettext("invalid type %d\n"), type);
 	}
@@ -217,8 +217,9 @@ verify_exacct_file(const char *file, int type)
 	} else {
 		/*
 		 * A non-header object exists. Insist that it be
-		 * either a process, task, or flow accounting record,
-		 * the same type as is desired.
+		 * either a process, task, flow or net accounting
+		 * record, the same type as is desired.
+		 * xxx-venu:check 101 merge for EXD_GROUP_NET_*
 		 */
 		uint_t c = eo.eo_catalog & EXD_DATA_MASK;
 
@@ -226,7 +227,12 @@ verify_exacct_file(const char *file, int type)
 		    (eo.eo_catalog & EXC_CATALOG_MASK) != EXC_NONE ||
 		    (!(c == EXD_GROUP_PROC && type == AC_PROC ||
 		    c == EXD_GROUP_TASK && type == AC_TASK ||
-		    c == EXD_GROUP_FLOW && type == AC_FLOW))) {
+		    c == EXD_GROUP_FLOW && type == AC_FLOW ||
+		    (c == EXD_GROUP_NET_LINK_DESC ||
+		    c == EXD_GROUP_NET_FLOW_DESC ||
+		    c == EXD_GROUP_NET_LINK_STATS ||
+		    c == EXD_GROUP_NET_FLOW_STATS) &&
+		    type == AC_NET))) {
 			(void) ea_close(&ef);
 			return (B_FALSE);
 		}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
index 4924d2fe4e..69e91758ea 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
@@ -22,7 +22,6 @@
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
-# ident "%Z%%M% %I% %E% SMI"
 #
 
 PROG = ifconfig
@@ -39,7 +38,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c)
 SRCS= $(LOCALSRCS) $(COMMONSRCS)
 
 CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp
-LDLIBS += -ldhcpagent -linetcfg -ldlpi
+LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm
 LINTFLAGS += -m
 
 ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index b33fc6c1b6..79e2991164 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -13,6 +13,7 @@
 #include "ifconfig.h"
 #include <compat.h>
 #include <libdlpi.h>
+#include <libdllink.h>
 #include <inet/ip.h>
 #include <inet/ipsec_impl.h>
@@ -4499,7 +4500,11 @@ static boolean_t
 ni_entry(const char *linkname, void *arg)
 {
 	dlpi_handle_t dh;
+	datalink_class_t class;
 
+	(void) dladm_name2info(linkname, NULL, NULL, &class, NULL);
+	if (class == DATALINK_CLASS_ETHERSTUB)
+		return (_B_FALSE);
 	if (dlpi_open(linkname, &dh, 0) != DLPI_SUCCESS)
 		return (_B_FALSE);
diff --git a/usr/src/cmd/dladm/Makefile b/usr/src/cmd/dladm/Makefile
index 94e6842ff3..6757c63d89 100644
--- a/usr/src/cmd/dladm/Makefile
+++ b/usr/src/cmd/dladm/Makefile
@@ -22,7 +22,6 @@
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
-#ident "%Z%%M% %I% %E% SMI"
 #
 
 PROG= dladm
@@ -35,6 +34,7 @@ ROOTCFGFILES= $(CFGFILES:%=$(ROOTCFGDIR)/%)
 include ../Makefile.cmd
 
 XGETFLAGS += -a -x $(PROG).xcl
+LDLIBS += -L$(ROOT)/lib -lsocket
 LDLIBS += -ldladm -ldlpi -lkstat -lsecdb -lbsm -linetutil -ldevinfo
 
 $(ROOTCFGFILES) := OWNER= dladm
diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c
index 466adfe6c0..9422a31da3 100644
--- a/usr/src/cmd/dladm/dladm.c
+++ b/usr/src/cmd/dladm/dladm.c
@@ -46,7 +46,9 @@
 #include <libintl.h>
 #include <libdevinfo.h>
 #include <libdlpi.h>
+#include <libdladm.h>
 #include <libdllink.h>
+#include <libdlstat.h>
 #include <libdlaggr.h>
 #include <libdlwlan.h>
 #include <libdlvlan.h>
@@ -54,11 +56,18 @@
 #include <libinetutil.h>
 #include <bsm/adt.h>
 #include <bsm/adt_event.h>
+#include <libdlvnic.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/processor.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if_types.h>
 #include <stddef.h>
 
-#define AGGR_DRV "aggr"
 #define STR_UNDEF_VAL "--"
 #define MAXPORT 256
+#define MAXVNIC 256
 #define BUFLEN(lim, ptr) (((lim) > (ptr)) ? ((lim) - (ptr)) : 0)
 #define MAXLINELEN 1024
 #define SMF_UPGRADE_FILE "/var/svc/profile/upgrade"
@@ -131,9 +140,7 @@
  * with a callback function that will be called for each field to be printed.
  * The callback function will be passed a pointer to the print_field_t
  * for the field, and the pf_index may then be used to identify the
- * system call required to find the value to be printed. An example of
- * this implementation may be found in the do_show_dev() and print_dev()
- * invocation.
+ * system call required to find the value to be printed.
  */
 
 typedef struct print_field_s {
@@ -192,15 +199,6 @@ static char *dladm_print_field(print_field_t *, void *);
 
 #define MAX_FIELD_LEN 32
 
-typedef struct pktsum_s {
-	uint64_t ipackets;
-	uint64_t opackets;
-	uint64_t rbytes;
-	uint64_t obytes;
-	uint32_t ierrors;
-	uint32_t oerrors;
-} pktsum_t;
-
 typedef struct show_state {
 	boolean_t ls_firstonly;
 	boolean_t ls_donefirst;
@@ -210,6 +208,8 @@ typedef struct show_state {
 	print_state_t ls_print;
 	boolean_t ls_parseable;
 	boolean_t ls_printheader;
+	boolean_t ls_mac;
+	boolean_t ls_hwgrp;
 } show_state_t;
 
 typedef struct show_grp_state {
@@ -226,9 +226,37 @@ typedef struct show_grp_state {
 	print_state_t gs_print;
 } show_grp_state_t;
 
+typedef struct show_vnic_state {
+	datalink_id_t vs_vnic_id;
+	datalink_id_t vs_link_id;
+	char vs_vnic[MAXLINKNAMELEN];
+	char vs_link[MAXLINKNAMELEN];
+	boolean_t vs_parseable;
+	boolean_t vs_printheader;
+	boolean_t vs_found;
+	boolean_t vs_firstonly;
+	boolean_t vs_donefirst;
+	boolean_t vs_stats;
+	boolean_t vs_printstats;
+	pktsum_t vs_totalstats;
+	pktsum_t vs_prevstats[MAXVNIC];
+	boolean_t vs_etherstub;
+	dladm_status_t vs_status;
+	uint32_t vs_flags;
+	print_state_t vs_print;
+} show_vnic_state_t;
+
+typedef struct show_usage_state_s {
+	boolean_t us_plot;
+	boolean_t us_parseable;
+	boolean_t us_printheader;
+	boolean_t us_first;
+	print_state_t us_print;
+} show_usage_state_t;
+
 typedef void cmdfunc_t(int, char **, const char *);
 
-static cmdfunc_t do_show_link, do_show_dev, do_show_wifi, do_show_phys;
+static cmdfunc_t do_show_link, do_show_wifi, do_show_phys;
 static cmdfunc_t do_create_aggr, do_delete_aggr, do_add_aggr, do_remove_aggr;
 static cmdfunc_t do_modify_aggr, do_show_aggr, do_up_aggr;
 static cmdfunc_t do_scan_wifi, do_connect_wifi, do_disconnect_wifi;
@@ -239,21 +267,25 @@ static cmdfunc_t do_create_vlan, do_delete_vlan, do_up_vlan, do_show_vlan;
 static cmdfunc_t do_rename_link, do_delete_phys, do_init_phys;
 static cmdfunc_t do_show_linkmap;
 static cmdfunc_t do_show_ether;
+static cmdfunc_t do_create_vnic, do_delete_vnic, do_show_vnic;
+static cmdfunc_t do_up_vnic;
+static cmdfunc_t do_create_etherstub, do_delete_etherstub, do_show_etherstub;
+static cmdfunc_t do_show_usage;
+
+static void do_up_vnic_common(int, char **, const char *, boolean_t);
 
 static void altroot_cmd(char *, int, char **);
 static int show_linkprop_onelink(datalink_id_t, void *);
 
 static void link_stats(datalink_id_t, uint_t, char *, show_state_t *);
 static void aggr_stats(datalink_id_t, show_grp_state_t *, uint_t);
-static void dev_stats(const char *dev, uint32_t, char *, show_state_t *);
+static void vnic_stats(show_vnic_state_t *, uint32_t);
 
 static int get_one_kstat(const char *, const char *, uint8_t,
     void *, boolean_t);
 static void get_mac_stats(const char *, pktsum_t *);
 static void get_link_stats(const char *, pktsum_t *);
 static uint64_t get_ifspeed(const char *, boolean_t);
-static void stats_total(pktsum_t *, pktsum_t *, pktsum_t *);
-static void stats_diff(pktsum_t *, pktsum_t *, pktsum_t *);
 static const char *get_linkstate(const char *, boolean_t, char *);
 static const char *get_linkduplex(const char *, boolean_t, char *);
 
@@ -286,8 +318,6 @@ static cmd_t cmds[] = {
 	    "\tshow-link\t[-pP] [-o <field>,..] [-s [-i <interval>]] [<link>]"},
 	{ "rename-link", do_rename_link,
 	    "\trename-link\t[-R <root-dir>] <oldlink> <newlink>\n" },
-	{ "show-dev", do_show_dev,
-	    "\tshow-dev\t[-p] [-o <field>,..] [-s [-i <interval>]] [<dev>]\n" },
 	{ "create-aggr", do_create_aggr,
 	    "\tcreate-aggr\t[-t] [-R <root-dir>] [-P <policy>] [-L <mode>]\n"
 	    "\t\t\t[-T <time>] [-u <address>] [-l <link>] ... <link>" },
@@ -343,9 +373,30 @@ static cmd_t cmds[] = {
 	{ "delete-phys", do_delete_phys,
 	    "\tdelete-phys\t<link>" },
 	{ "show-phys", do_show_phys,
-	    "\tshow-phys\t[-pP] [-o <field>,..] [<link>]" },
+	    "\tshow-phys\t[-pP] [-o <field>,..] [-H] [<link>]" },
 	{ "init-phys", do_init_phys, NULL },
-	{ "show-linkmap", do_show_linkmap, NULL }
+	{ "show-linkmap", do_show_linkmap, NULL },
+	{ "create-vnic", do_create_vnic,
+	    "\tcreate-vnic [-t] [-R <root-dir>] -l <link> [-m <value> |"
+	    " auto |\n"
+	    "\t {factory [-n <slot-identifier>]} |\n"
+	    "\t {random [-r <prefix>]}] [-v vlan-tag [-f]]\n"
+	    "\t -p <prop>=<value>[,...] [-H]"
+	    " <vnic-link>\n" },
+	{ "delete-vnic", do_delete_vnic,
+	    "\tdelete-vnic [-t] [-R <root-dir>] <vnic-link>\n" },
+	{ "show-vnic", do_show_vnic,
+	    "\tshow-vnic [-pP] [-l <link>] [-s [-i <interval>]]" },
+	{ "up-vnic", do_up_vnic, NULL },
+	{ "create-etherstub", do_create_etherstub,
+	    "\tcreate-etherstub [-t] [-R <root-dir>] <link>\n" },
+	{ "delete-etherstub", do_delete_etherstub,
+	    "\tdelete-etherstub [-t] [-R <root-dir>] <link>\n" },
+	{ "show-etherstub", do_show_etherstub,
+	    "\tshow-etherstub [-t] [-R <root-dir>] [<link>]\n" },
+	{ "show-usage", do_show_usage,
+	    "\tshow-usage [-d|-p -F <format>] [-f <filename>]\n"
+	    "\t [-s <time>] [-e <time>] <link>\n" }
 };
 
 static const struct option lopts[] = {
@@ -360,11 +411,15 @@ static const struct option lopts[] = {
 	{"root-dir", required_argument, 0, 'R'},
 	{"link", required_argument, 0, 'l'},
 	{"forcible", no_argument, 0, 'f'},
+	{"bw-limit", required_argument, 0, 'b'},
+	{"mac-address", required_argument, 0, 'm'},
+	{"slot", required_argument, 0, 'n'},
 	{ 0, 0, 0, 0 }
 };
 
 static const struct option show_lopts[] = {
 	{"statistics", no_argument, 0, 's'},
+	{"continuous", no_argument, 0, 'S'},
 	{"interval", required_argument, 0, 'i'},
 	{"parseable", no_argument, 0, 'p'},
 	{"extended", no_argument, 0, 'x'},
@@ -409,6 +464,24 @@ static const struct option showeth_lopts[] = {
 	{ 0, 0, 0, 0 }
 };
 
+static const struct option vnic_lopts[] = {
+	{"temporary", no_argument, 0, 't' },
+	{"root-dir", required_argument, 0, 'R' },
+	{"dev", required_argument, 0, 'd' },
+	{"mac-address", required_argument, 0, 'm' },
+	{"cpus", required_argument, 0, 'c' },
+	{"bw-limit", required_argument, 0, 'b' },
+	{"slot", required_argument, 0, 'n' },
+	{"mac-prefix", required_argument, 0, 'r' },
+	{ 0, 0, 0, 0 }
+};
+
+static const struct option etherstub_lopts[] = {
+	{"temporary", no_argument, 0, 't' },
+	{"root-dir", required_argument, 0, 'R' },
+	{ 0, 0, 0, 0 }
+};
+
 /*
  * structures for 'dladm show-ether'
 */
@@ -451,26 +524,7 @@ typedef struct print_ether_state {
 } print_ether_state_t;
 
 /*
- * structures for 'dladm show-dev'.
- */
-typedef enum {
-	DEV_LINK,
-	DEV_STATE,
-	DEV_SPEED,
-	DEV_DUPLEX
-} dev_field_index_t;
-
-static print_field_t dev_fields[] = {
-/* name, header, field width, index, cmdtype */
-{ "link", "LINK", 15, DEV_LINK, CMD_TYPE_ANY},
-{ "state", "STATE", 6, DEV_STATE, CMD_TYPE_ANY},
-{ "speed", "SPEED", 8, DEV_SPEED, CMD_TYPE_ANY},
-{ "duplex", "DUPLEX", 8, DEV_DUPLEX, CMD_TYPE_ANY}}
-;
-#define DEV_MAX_FIELDS (sizeof (dev_fields) / sizeof (print_field_t))
-
-/*
- * structures for 'dladm show-dev -s' (print statistics)
+ * structures for 'dladm show-link -s' (print statistics)
  */
 typedef enum {
 	DEVS_LINK,
@@ -493,12 +547,6 @@ static print_field_t devs_fields[] = {
 { "oerrors", "OERRORS", 8, DEVS_OERRORS, CMD_TYPE_ANY}}
 ;
 #define DEVS_MAX_FIELDS (sizeof (devs_fields) / sizeof (print_field_t))
-typedef struct dev_args_s {
-	char *devs_link;
-	pktsum_t *devs_psum;
-} dev_args_t;
-static char *print_dev_stats(print_field_t *, void *);
-static char *print_dev(print_field_t *, void *);
 
 /*
  * buffer used by print functions for show-{link,phys,vlan} commands.
@@ -635,10 +683,10 @@ static print_field_t aggr_s_fields[] = {
     CMD_TYPE_ANY}}
 ;
 #define AGGR_S_MAX_FIELDS \
-	(sizeof (aggr_l_fields) / sizeof (print_field_t))
+	(sizeof (aggr_s_fields) / sizeof (print_field_t))
 
 /*
- * structures for 'dladm show-dev -L'.
+ * structures for 'dladm show-aggr -L'.
  */
 typedef enum {
 	AGGR_L_LINK,
@@ -697,6 +745,50 @@ static print_field_t phys_fields[] = {
 #define PHYS_MAX_FIELDS (sizeof (phys_fields) / sizeof (print_field_t))
 
 /*
+ * structures for 'dladm show-phys -m'
+ */
+
+typedef enum {
+	PHYS_M_LINK,
+	PHYS_M_SLOT,
+	PHYS_M_ADDRESS,
+	PHYS_M_INUSE,
+	PHYS_M_CLIENT
+} phys_m_field_index_t;
+
+static print_field_t phys_m_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12, PHYS_M_LINK, CMD_TYPE_ANY},
+{ "slot", "SLOT", 8, PHYS_M_SLOT, CMD_TYPE_ANY},
+{ "address", "ADDRESS", 18, PHYS_M_ADDRESS, CMD_TYPE_ANY},
+{ "inuse", "INUSE", 4, PHYS_M_INUSE, CMD_TYPE_ANY},
+{ "client", "CLIENT", 12, PHYS_M_CLIENT, CMD_TYPE_ANY}}
+;
+#define PHYS_M_MAX_FIELDS (sizeof (phys_m_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-phys -H'
+ */
+
+typedef enum {
+	PHYS_H_LINK,
+	PHYS_H_GROUP,
+	PHYS_H_GRPTYPE,
+	PHYS_H_RINGS,
+	PHYS_H_CLIENTS
+} phys_h_field_index_t;
+
+static print_field_t phys_h_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12, PHYS_H_LINK, CMD_TYPE_ANY},
+{ "group", "GROUP", 8, PHYS_H_GROUP, CMD_TYPE_ANY},
+{ "grouptype", "TYPE", 6, PHYS_H_GRPTYPE, CMD_TYPE_ANY},
+{ "rings", "NUM-RINGS", 16, PHYS_H_RINGS, CMD_TYPE_ANY},
+{ "clients", "CLIENTS", 20, PHYS_H_CLIENTS, CMD_TYPE_ANY}}
+;
+#define PHYS_H_MAX_FIELDS (sizeof (phys_h_fields) / sizeof (print_field_t))
+
+/*
  * structures for 'dladm show-vlan'
  */
 static print_field_t vlan_fields[] = {
@@ -712,6 +804,7 @@ static print_field_t vlan_fields[] = {
 ;
 #define VLAN_MAX_FIELDS (sizeof (vlan_fields) / sizeof (print_field_t))
 
+
 /*
  * structures for 'dladm show-wifi'
 */
@@ -764,34 +857,28 @@ static print_field_t linkprop_fields[] = {
 #define LINKPROP_MAX_FIELDS \
 	(sizeof (linkprop_fields) / sizeof (print_field_t))
 
-#define MAX_PROPS 32
 #define MAX_PROP_LINE 512
 
-typedef struct prop_info {
-	char *pi_name;
-	char *pi_val[DLADM_MAX_PROP_VALCNT];
-	uint_t pi_count;
-} prop_info_t;
-
-typedef struct prop_list {
-	prop_info_t pl_info[MAX_PROPS];
-	uint_t pl_count;
-	char *pl_buf;
-} prop_list_t;
-
 typedef struct show_linkprop_state {
-	char ls_link[MAXLINKNAMELEN];
-	char *ls_line;
-	char **ls_propvals;
-	prop_list_t *ls_proplist;
-	boolean_t ls_parseable;
-	boolean_t ls_persist;
-	boolean_t ls_header;
-	dladm_status_t ls_status;
-	dladm_status_t ls_retstatus;
-	print_state_t ls_print;
+	char		ls_link[MAXLINKNAMELEN];
+	char		*ls_line;
+	char		**ls_propvals;
+	dladm_arg_list_t *ls_proplist;
+	boolean_t	ls_parseable;
+	boolean_t	ls_persist;
+	boolean_t	ls_header;
+	dladm_status_t	ls_status;
+	dladm_status_t	ls_retstatus;
+	print_state_t	ls_print;
 } show_linkprop_state_t;
 
+typedef struct set_linkprop_state {
+	const char	*ls_name;
+	boolean_t	ls_reset;
+	boolean_t	ls_temp;
+	dladm_status_t	ls_status;
+} set_linkprop_state_t;
+
 typedef struct linkprop_args_s {
 	show_linkprop_state_t *ls_state;
 	char *ls_propname;
@@ -817,9 +904,108 @@ static print_field_t secobj_fields[] = {
 ;
 #define DEV_SOBJ_FIELDS (sizeof (secobj_fields) / sizeof (print_field_t))
 
+/*
+ * structures for 'dladm show-vnic'
+ */
+typedef struct vnic_fields_buf_s
+{
+	char vnic_link[DLPI_LINKNAME_MAX];
+	char vnic_over[DLPI_LINKNAME_MAX];
+	char vnic_speed[6];
+	char vnic_macaddr[19];
+	char vnic_macaddrtype[19];
+	char vnic_vid[6];
+} vnic_fields_buf_t;
+
+static print_field_t vnic_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+    offsetof(vnic_fields_buf_t, vnic_link), CMD_TYPE_ANY},
+{ "over", "OVER", 12,
+    offsetof(vnic_fields_buf_t, vnic_over), CMD_TYPE_ANY},
+{ "speed", "SPEED", 6,
+    offsetof(vnic_fields_buf_t, vnic_speed), CMD_TYPE_ANY},
+{ "macaddr", "MACADDRESS", 20,
+    offsetof(vnic_fields_buf_t, vnic_macaddr), CMD_TYPE_ANY},
+{ "macaddrtype", "MACADDRTYPE", 19,
+    offsetof(vnic_fields_buf_t, vnic_macaddrtype), CMD_TYPE_ANY},
+{ "vid", "VID", 6,
+    offsetof(vnic_fields_buf_t, vnic_vid), CMD_TYPE_ANY}}
+;
+#define VNIC_MAX_FIELDS (sizeof (vnic_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage'
+ */
+
+typedef struct usage_fields_buf_s {
+	char usage_link[12];
+	char usage_duration[10];
+	char usage_ipackets[9];
+	char usage_rbytes[10];
+	char usage_opackets[9];
+	char usage_obytes[10];
+	char usage_bandwidth[14];
+} usage_fields_buf_t;
+
+static print_field_t usage_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+    offsetof(usage_fields_buf_t, usage_link), CMD_TYPE_ANY},
+{ "duration", "DURATION", 10,
+    offsetof(usage_fields_buf_t, usage_duration), CMD_TYPE_ANY},
+{ "ipackets", "IPACKETS", 9,
+    offsetof(usage_fields_buf_t, usage_ipackets), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 10,
+    offsetof(usage_fields_buf_t, usage_rbytes), CMD_TYPE_ANY},
+{ "opackets", "OPACKETS", 9,
+    offsetof(usage_fields_buf_t, usage_opackets), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 10,
+    offsetof(usage_fields_buf_t, usage_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+    offsetof(usage_fields_buf_t, usage_bandwidth), CMD_TYPE_ANY}}
+;
+
+#define USAGE_MAX_FIELDS (sizeof (usage_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage link'
+ */
+
+typedef struct usage_l_fields_buf_s {
+	char usage_l_link[12];
+	char usage_l_stime[13];
+	char usage_l_etime[13];
+	char usage_l_rbytes[8];
+	char usage_l_obytes[8];
+	char usage_l_bandwidth[14];
+} usage_l_fields_buf_t;
+
+static print_field_t usage_l_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+    offsetof(usage_l_fields_buf_t, usage_l_link), CMD_TYPE_ANY},
+{ "start", "START", 13,
+    offsetof(usage_l_fields_buf_t, usage_l_stime), CMD_TYPE_ANY},
+{ "end", "END", 13,
+    offsetof(usage_l_fields_buf_t, usage_l_etime), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 8,
+    offsetof(usage_l_fields_buf_t, usage_l_rbytes), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 8,
+    offsetof(usage_l_fields_buf_t, usage_l_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+    offsetof(usage_l_fields_buf_t, usage_l_bandwidth), CMD_TYPE_ANY}}
+;
+
+#define USAGE_L_MAX_FIELDS \
+	(sizeof (usage_l_fields) /sizeof (print_field_t))
+
 static char *progname;
 static sig_atomic_t signalled;
 
+#define DLADM_ETHERSTUB_NAME "etherstub"
+#define DLADM_IS_ETHERSTUB(id) (id == DATALINK_INVALID_LINKID)
+
 static void
 usage(void)
 {
@@ -867,6 +1053,254 @@ main(int argc, char *argv[])
 	return (0);
 }
 
+/*ARGSUSED*/
+static int
+show_usage_date(dladm_usage_t *usage, void *arg)
+{
+
+	time_t stime;
+	char timebuf[20];
+
+	stime = usage->du_stime;
+	(void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y",
+	    localtime(&stime));
+	(void) printf("%s\n", timebuf);
+
+	return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_time(dladm_usage_t *usage, void *arg)
+{
+	show_usage_state_t *state = (show_usage_state_t *)arg;
+	char buf[DLADM_STRSIZE];
+	usage_l_fields_buf_t ubuf;
+	time_t time;
+	double bw;
+
+	if (state->us_plot) {
+		if (!state->us_printheader) {
+			if (state->us_first) {
+				(void) printf("# Time");
+				state->us_first = B_FALSE;
+			}
+			(void) printf(" %s", usage->du_name);
+			if (usage->du_last) {
+				(void) printf("\n");
+				state->us_first = B_TRUE;
+				state->us_printheader = B_TRUE;
+			}
+		} else {
+			if (state->us_first) {
+				time = usage->du_etime;
+				(void) strftime(buf, sizeof (buf), "%T",
+				    localtime(&time));
+				state->us_first = B_FALSE;
+				(void) printf("%s", buf);
+			}
+			bw = (double)usage->du_bandwidth/1000;
+			(void) printf(" %.2f", bw);
+			if (usage->du_last) {
+				(void) printf("\n");
+				state->us_first = B_TRUE;
+			}
+		}
+		return (DLADM_STATUS_OK);
+	}
+
+	bzero(&ubuf, sizeof (ubuf));
+
+	(void) snprintf(ubuf.usage_l_link, sizeof (ubuf.usage_l_link), "%s",
+	    usage->du_name);
+	time = usage->du_stime;
+	(void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+	(void) snprintf(ubuf.usage_l_stime, sizeof (ubuf.usage_l_stime), "%s",
+	    buf);
+	time = usage->du_etime;
+	(void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+	(void) snprintf(ubuf.usage_l_etime, sizeof (ubuf.usage_l_etime), "%s",
+	    buf);
+	(void) snprintf(ubuf.usage_l_rbytes, sizeof (ubuf.usage_l_rbytes),
+	    "%llu", usage->du_rbytes);
+	(void) snprintf(ubuf.usage_l_obytes, sizeof (ubuf.usage_l_obytes),
+	    "%llu", usage->du_obytes);
+	(void) snprintf(ubuf.usage_l_bandwidth, sizeof (ubuf.usage_l_bandwidth),
+	    "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+	if (!state->us_parseable && !state->us_printheader) {
+		print_header(&state->us_print);
+		state->us_printheader = B_TRUE;
+	}
+
+	dladm_print_output(&state->us_print, state->us_parseable,
+	    dladm_print_field, (void *)&ubuf);
+
+	return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_res(dladm_usage_t *usage, void *arg)
+{
+	show_usage_state_t *state = (show_usage_state_t *)arg;
+	char buf[DLADM_STRSIZE];
+	usage_fields_buf_t ubuf;
+
+	bzero(&ubuf, sizeof (ubuf));
+
+	(void) snprintf(ubuf.usage_link, sizeof (ubuf.usage_link), "%s",
+	    usage->du_name);
+	(void) snprintf(ubuf.usage_duration, sizeof (ubuf.usage_duration),
+	    "%llu", usage->du_duration);
+	(void) snprintf(ubuf.usage_ipackets, sizeof (ubuf.usage_ipackets),
+	    "%llu", usage->du_ipackets);
+	(void) snprintf(ubuf.usage_rbytes, sizeof (ubuf.usage_rbytes),
+	    "%llu", usage->du_rbytes);
+	(void) snprintf(ubuf.usage_opackets, sizeof (ubuf.usage_opackets),
+	    "%llu", usage->du_opackets);
+	(void) snprintf(ubuf.usage_obytes, sizeof (ubuf.usage_obytes),
+	    "%llu", usage->du_obytes);
+	(void) snprintf(ubuf.usage_bandwidth, sizeof (ubuf.usage_bandwidth),
+	    "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+	if (!state->us_parseable && !state->us_printheader) {
+		print_header(&state->us_print);
+		state->us_printheader = B_TRUE;
+	}
+
+	dladm_print_output(&state->us_print, state->us_parseable,
+	    dladm_print_field, (void *)&ubuf);
+
+	return (DLADM_STATUS_OK);
+}
+
+static boolean_t
+valid_formatspec(char *formatspec_str)
+{
+	if (strcmp(formatspec_str, "gnuplot") == 0)
+		return (B_TRUE);
+	return (B_FALSE);
+
+}
+
+/*ARGSUSED*/
+static void
+do_show_usage(int argc, char *argv[], const char *use)
+{
+	char *file = NULL;
+	int opt;
+	dladm_status_t status;
+	boolean_t d_arg = B_FALSE;
+	boolean_t p_arg = B_FALSE;
+	char *stime = NULL;
+	char *etime = NULL;
+	char *resource = NULL;
+	show_usage_state_t state;
+	boolean_t o_arg = B_FALSE;
+	boolean_t F_arg = B_FALSE;
+	char *fields_str = NULL;
+	char *formatspec_str = NULL;
+	print_field_t **fields;
+	uint_t nfields;
+	char *all_fields =
+	    "link,duration,ipackets,rbytes,opackets,obytes,bandwidth";
+	char *all_l_fields =
+	    "link,start,end,rbytes,obytes,bandwidth";
+
+	bzero(&state, sizeof (show_usage_state_t));
+	state.us_parseable = B_FALSE;
+	state.us_printheader = B_FALSE;
+	state.us_plot = B_FALSE;
+	state.us_first = B_TRUE;
+
+	while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) {
+		switch (opt) {
+		case 'd':
+			d_arg = B_TRUE;
+			break;
+		case 'p':
+			state.us_plot = p_arg = B_TRUE;
+			break;
+		case 'f':
+			file = optarg;
+			break;
+		case 's':
+			stime = optarg;
+			break;
+		case 'e':
+			etime = optarg;
+			break;
+		case 'o':
+			o_arg = B_TRUE;
+			fields_str = optarg;
+			break;
+		case 'F':
+			F_arg = B_TRUE;
+			formatspec_str = optarg;
+			break;
+		default:
+			die_opterr(optopt, opt, use);
+			break;
+		}
+	}
+
+	if (file == NULL)
+		die("show-usage requires a file");
+
+	if (optind == (argc-1)) {
+		resource = argv[optind];
+	}
+
+	if (resource == NULL && stime == NULL && etime == NULL) {
+		if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+			fields_str = all_fields;
+		fields = parse_output_fields(fields_str, usage_fields,
+		    USAGE_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+	} else {
+		if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+			fields_str = all_l_fields;
+		fields = parse_output_fields(fields_str, usage_l_fields,
+		    USAGE_L_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+	}
+
+	if (fields == NULL) {
+		die("invalid fields(s) specified");
+		return;
+	}
+	state.us_print.ps_fields = fields;
+	state.us_print.ps_nfields = nfields;
+
+	if (p_arg && d_arg)
+		die("plot and date options are incompatible");
+
+	if (p_arg && !F_arg)
+		die("specify format speicifier: -F <format>");
+
+	if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
+		die("Format specifier %s not supported", formatspec_str);
+
+	if (d_arg) {
+		/* Print log dates */
+		status = dladm_usage_dates(show_usage_date,
+		    DLADM_LOGTYPE_LINK, file, resource, &state);
+	} else if (resource == NULL && stime == NULL && etime == NULL &&
+	    !p_arg) {
+		/* Print summary */
+		status = dladm_usage_summary(show_usage_res,
+		    DLADM_LOGTYPE_LINK, file, &state);
+	} else if (resource != NULL) {
+		/* Print log entries for named resource */
+		status = dladm_walk_usage_res(show_usage_time,
+		    DLADM_LOGTYPE_LINK, file, resource, stime, etime, &state);
+	} else {
+		/* Print time and information for each link */
+		status = dladm_walk_usage_time(show_usage_time,
+		    DLADM_LOGTYPE_LINK, file, stime, etime, &state);
+	}
+
+	if (status != DLADM_STATUS_OK)
+		die_dlerr(status, "show-usage");
+}
+
 static void
 do_create_aggr(int argc, char *argv[], const char *use)
 {
@@ -889,9 +1323,13 @@ do_create_aggr(int argc, char *argv[], const char *use)
 	char *devs[MAXPORT];
 	char *links[MAXPORT];
 	dladm_status_t status;
+	dladm_status_t pstatus;
+	dladm_arg_list_t *proplist = NULL;
+	int i;
+	datalink_id_t linkid;
 
 	ndev = nlink = opterr = 0;
-	while ((option = getopt_long(argc, argv, ":d:l:L:P:R:tfu:T:",
+	while ((option = getopt_long(argc, argv, ":d:l:L:P:R:tfu:T:p:",
 	    lopts, NULL)) != -1) {
 		switch (option) {
 		case 'd':
@@ -955,6 +1393,11 @@ do_create_aggr(int argc, char *argv[], const char *use)
 		case 'R':
 			altroot = optarg;
 			break;
+		case 'p':
+			if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+			    != DLADM_STATUS_OK)
+				die("invalid aggregation property");
+			break;
 		default:
 			die_opterr(optopt, option, use);
 			break;
@@ -1000,7 +1443,30 @@ do_create_aggr(int argc, char *argv[], const char *use)
 	status = dladm_aggr_create(name, key, ndev + nlink, port, policy,
 	    mac_addr_fixed, (const uchar_t *)mac_addr, lacp_mode,
 	    lacp_timer, flags);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	if (proplist == NULL)
+		return;
+
+	status = dladm_name2info(name, &linkid, NULL, NULL, NULL);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	for (i = 0; i < proplist->al_count; i++) {
+		dladm_arg_info_t *aip = &proplist->al_info[i];
+
+		pstatus = dladm_set_linkprop(linkid, aip->ai_name,
+		    aip->ai_val, aip->ai_count, flags);
+
+		if (pstatus != DLADM_STATUS_OK) {
+			die_dlerr(pstatus,
+			    "aggr creation succeeded but "
+			    "could not set property '%s'", aip->ai_name);
+		}
+	}
 done:
+	dladm_free_props(proplist);
 	if (status != DLADM_STATUS_OK) {
 		if (status == DLADM_STATUS_NONOTIF) {
 			die_dlerr(status, "not all links have link up/down "
@@ -1379,19 +1845,21 @@ done:
 static void
 do_create_vlan(int argc, char *argv[], const char *use)
 {
-	char *link = NULL;
-	char drv[DLPI_LINKNAME_MAX];
-	uint_t ppa;
-	datalink_id_t linkid;
-	int vid = 0;
-	char option;
-	uint32_t flags = (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST);
-	char *altroot = NULL;
-	char vlan[MAXLINKNAMELEN];
-	dladm_status_t status;
+	char		*link = NULL;
+	char		drv[DLPI_LINKNAME_MAX];
+	uint_t		ppa;
+	datalink_id_t	linkid;
+	datalink_id_t	dev_linkid;
+	int		vid = 0;
+	char		option;
+	uint32_t	flags = (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST);
+	char		*altroot = NULL;
+	char		vlan[MAXLINKNAMELEN];
+	dladm_arg_list_t	*proplist = NULL;
+	dladm_status_t	status;
 
 	opterr = 0;
-	while ((option = getopt_long(argc, argv, ":tfl:v:",
+	while ((option = getopt_long(argc, argv, ":tfR:l:v:p:",
 	    lopts, NULL)) != -1) {
 		switch (option) {
 		case 'v':
@@ -1408,15 +1876,21 @@ do_create_vlan(int argc, char *argv[], const char *use)
 			link = optarg;
 			break;
-		case 'f':
-			flags |= DLADM_OPT_FORCE;
-			break;
 		case 't':
 			flags &= ~DLADM_OPT_PERSIST;
 			break;
 		case 'R':
 			altroot = optarg;
 			break;
+		case 'p':
+			if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+			    != DLADM_STATUS_OK) {
+				die("invalid vlan property");
+			}
+			break;
+		case 'f':
+			flags |= DLADM_OPT_FORCE;
+			break;
 		default:
			die_opterr(optopt, option, use);
 			break;
@@ -1444,19 +1918,14 @@ do_create_vlan(int argc, char *argv[], const char *use)
 	if (altroot != NULL)
 		altroot_cmd(altroot, argc, argv);
 
-	if (dladm_name2info(link, &linkid, NULL, NULL, NULL) !=
+	if (dladm_name2info(link, &dev_linkid, NULL, NULL, NULL) !=
 	    DLADM_STATUS_OK) {
 		die("invalid link name '%s'", link);
 	}
 
-	if ((status = dladm_vlan_create(vlan, linkid, vid, flags)) !=
-	    DLADM_STATUS_OK) {
-		if (status == DLADM_STATUS_NOTSUP) {
-			die_dlerr(status, "VLAN over '%s' may require lowered "
-			    "MTU; must use -f (see dladm(1M))\n", link);
-		} else {
-			die_dlerr(status, "create operation failed");
-		}
+	if ((status = dladm_vlan_create(vlan, dev_linkid, vid, proplist, flags,
+	    &linkid)) != DLADM_STATUS_OK) {
+		die_dlerr(status, "create operation over %s failed", link);
 	}
 }
 
@@ -1505,31 +1974,7 @@ done:
 static void
 do_up_vlan(int argc, char *argv[], const char *use)
 {
-	datalink_id_t linkid = DATALINK_ALL_LINKID;
-	dladm_status_t status;
-
-	/*
-	 * get the name of the VLAN (optional last argument)
-	 */
-	if (argc > 2)
-		usage();
-
-	if (argc == 2) {
-		status = dladm_name2info(argv[1], &linkid, NULL, NULL, NULL);
-		if (status != DLADM_STATUS_OK)
-			goto done;
-	}
-
-	status = dladm_vlan_up(linkid);
-done:
-	if (status != DLADM_STATUS_OK) {
-		if (argc == 2) {
-			die_dlerr(status,
-			    "could not bring up VLAN '%s'", argv[1]);
-		} else {
-			die_dlerr(status, "could not bring VLANs up");
-		}
-	}
+	do_up_vnic_common(argc, argv, use, B_TRUE);
 }
 
 static void
@@ -1724,7 +2169,7 @@ print_link_topology(show_state_t *state, datalink_id_t linkid,
 		}
 		free(ginfo.lg_ports);
 	} else if (class == DATALINK_CLASS_VNIC) {
-		dladm_vnic_attr_sys_t vinfo;
+		dladm_vnic_attr_t vinfo;
 
 		if ((status = dladm_vnic_info(linkid, &vinfo, flags)) !=
 		    DLADM_STATUS_OK || (status = dladm_datalink_id2info(
@@ -1816,7 +2261,6 @@ done:
 	return (status);
 }
 
-
 static int
 show_link(datalink_id_t linkid, void *arg)
 {
@@ -1854,7 +2298,6 @@ show_link_stats(datalink_id_t linkid, void *arg)
 	show_state_t *state = (show_state_t *)arg;
 	pktsum_t stats, diff_stats;
 	dladm_phys_attr_t dpa;
-	dev_args_t largs;
 
 	if (state->ls_firstonly) {
 		if (state->ls_donefirst)
@@ -1881,12 +2324,15 @@ show_link_stats(datalink_id_t linkid, void *arg)
 	} else {
 		get_link_stats(link, &stats);
 	}
-	stats_diff(&diff_stats, &stats, &state->ls_prevstats);
+	dladm_stats_diff(&diff_stats, &stats, &state->ls_prevstats);
 
-	largs.devs_link = link;
-	largs.devs_psum = &diff_stats;
-	dladm_print_output(&state->ls_print, state->ls_parseable,
-	    print_dev_stats, &largs);
+	(void) printf("%-12s", link);
+	(void) printf("%-10llu", diff_stats.ipackets);
+	(void) printf("%-12llu", diff_stats.rbytes);
+	(void) printf("%-8llu", diff_stats.ierrors);
+	(void) printf("%-10llu", diff_stats.opackets);
+	(void) printf("%-12llu", diff_stats.obytes);
+	(void) printf("%-8llu\n", diff_stats.oerrors);
 
 	state->ls_prevstats = stats;
 	return (DLADM_WALK_CONTINUE);
@@ -2192,7 +2638,7 @@ print_aggr_stats_callback(print_field_t *pf, void *arg)
 			goto err;
 		}
 
-		stats_diff(&diff_stats, &port_stat, l->laggr_prevstats);
+		dladm_stats_diff(&diff_stats, &port_stat, l->laggr_prevstats);
 	}
 
 	switch (pf->pf_index) {
@@ -2296,7 +2742,8 @@ print_aggr_stats(show_grp_state_t *state, const char *link,
 		}
 
 		get_mac_stats(dpa.dp_dev, &port_stat);
-		stats_total(&pktsumtot, &port_stat, &state->gs_prevstats[i]);
+		dladm_stats_total(&pktsumtot, &port_stat,
+		    &state->gs_prevstats[i]);
 	}
 
 	if (!state->gs_parseable && !state->gs_printheader) {
@@ -2381,127 +2828,17 @@ done:
 	return (DLADM_WALK_CONTINUE);
 }
 
-static char *
-print_dev(print_field_t *pf, void *arg)
-{
-	const char *dev = arg;
-	static char buf[DLADM_STRSIZE];
-
-	switch (pf->pf_index) {
-	case DEV_LINK:
-		(void) snprintf(buf, sizeof (buf), "%s", dev);
-		break;
-	case DEV_STATE:
-		(void) get_linkstate(dev, B_FALSE, buf);
-		break;
-	case DEV_SPEED:
-		(void) snprintf(buf, sizeof (buf), "%uMb",
-		    (unsigned int)(get_ifspeed(dev, B_FALSE) / 1000000ull));
-		break;
-	case DEV_DUPLEX:
-		(void) get_linkduplex(dev, B_FALSE, buf);
-		break;
-	default:
-		die("invalid index '%d'", pf->pf_index);
-		break;
-	}
-	return (buf);
-}
-
-static int
-show_dev(const char *dev, void *arg)
-{
-	show_state_t *state = arg;
-
-	if (!state->ls_parseable && !state->ls_printheader) {
-		print_header(&state->ls_print);
-		state->ls_printheader = B_TRUE;
-	}
-
-	dladm_print_output(&state->ls_print, state->ls_parseable,
-	    print_dev, (void *)dev);
-
-	return (DLADM_WALK_CONTINUE);
-}
-
-static char *
-print_dev_stats(print_field_t *pf, void *arg)
-{
-	dev_args_t *dargs = arg;
-	pktsum_t *diff_stats = dargs->devs_psum;
-	static char buf[DLADM_STRSIZE];
-
-	switch (pf->pf_index) {
-	case DEVS_LINK:
-		(void) snprintf(buf, sizeof (buf), "%s", dargs->devs_link);
-		break;
-	case DEVS_IPKTS:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->ipackets);
-		break;
-	case DEVS_RBYTES:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->rbytes);
-		break;
-	case DEVS_IERRORS:
-		(void) snprintf(buf, sizeof (buf), "%u",
-		    diff_stats->ierrors);
-		break;
-	case DEVS_OPKTS:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->opackets);
-		break;
-	case DEVS_OBYTES:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->obytes);
-		break;
-	case DEVS_OERRORS:
-		(void) snprintf(buf, sizeof (buf), "%u",
-		    diff_stats->oerrors);
-		break;
-	default:
-		die("invalid input");
-		break;
-	}
-	return (buf);
-}
-
-static int
-show_dev_stats(const char *dev, void *arg)
-{
-	show_state_t *state = arg;
-	pktsum_t stats, diff_stats;
-	dev_args_t dargs;
-
-	if (state->ls_firstonly) {
-		if (state->ls_donefirst)
-			return (DLADM_WALK_CONTINUE);
-		state->ls_donefirst = B_TRUE;
-	} else {
-		bzero(&state->ls_prevstats, sizeof (state->ls_prevstats));
-	}
-
-	get_mac_stats(dev, &stats);
-	stats_diff(&diff_stats, &stats, &state->ls_prevstats);
-
-	dargs.devs_link = (char *)dev;
-	dargs.devs_psum = &diff_stats;
-	dladm_print_output(&state->ls_print, state->ls_parseable,
-	    print_dev_stats, &dargs);
-
-	state->ls_prevstats = stats;
-	return (DLADM_WALK_CONTINUE);
-}
-
 static void
 do_show_link(int argc, char *argv[], const char *use)
 {
 	int		option;
 	boolean_t	s_arg = B_FALSE;
+	boolean_t	S_arg = B_FALSE;
 	boolean_t	i_arg = B_FALSE;
 	uint32_t	flags = DLADM_OPT_ACTIVE;
 	boolean_t	p_arg = B_FALSE;
 	datalink_id_t	linkid = DATALINK_ALL_LINKID;
+	char		linkname[MAXLINKNAMELEN];
 	int		interval = 0;
 	show_state_t	state;
 	dladm_status_t status;
@@ -2517,7 +2854,7 @@ do_show_link(int argc, char *argv[], const char *use)
 	bzero(&state, sizeof (state));
 
 	opterr = 0;
-	while ((option = getopt_long(argc, argv, ":pPsi:o:",
+	while ((option = getopt_long(argc, argv, ":pPsSi:o:",
 	    show_lopts, NULL)) != -1) {
 		switch (option) {
 		case 'p':
@@ -2538,6 +2875,12 @@ do_show_link(int argc, char *argv[], const char *use)
 			flags = DLADM_OPT_PERSIST;
 			break;
+		case 'S':
+			if (S_arg)
+				die_optdup(option);
+
+			S_arg = B_TRUE;
+			break;
 		case 'o':
 			o_arg = B_TRUE;
 			fields_str = optarg;
@@ -2556,19 +2899,32 @@ do_show_link(int argc, char *argv[], const char *use)
 		}
 	}
 
-	if (i_arg && !s_arg)
-		die("the option -i can be used only with -s");
+	if (i_arg && !(s_arg || S_arg))
+		die("the option -i can be used only with -s or -S");
+
+	if (s_arg && S_arg)
+		die("the -s option cannot be used with -S");
 
 	if (s_arg && flags != DLADM_OPT_ACTIVE)
 		die("the option -P cannot be used with -s");
 
+	if (S_arg && (p_arg || flags != DLADM_OPT_ACTIVE))
+		die("the option -%c cannot be used with -S", p_arg ? 'p' : 'P');
+
 	/* get link name (optional last argument) */
 	if (optind == (argc-1)) {
 		uint32_t f;
 
-		if ((status = dladm_name2info(argv[optind], &linkid, &f,
+		if (strlcpy(linkname, argv[optind], MAXLINKNAMELEN)
+		    >= MAXLINKNAMELEN) {
+			(void) fprintf(stderr,
+			    gettext("%s: link name too long\n"),
+			    progname);
+			exit(1);
+		}
+		if ((status = dladm_name2info(linkname, &linkid, &f,
 		    NULL, NULL)) != DLADM_STATUS_OK) {
-			die_dlerr(status, "link %s is not valid", argv[optind]);
+			die_dlerr(status, "link %s is not valid", linkname);
 		}
 
 		if (!(f & flags)) {
@@ -2583,6 +2939,11 @@ do_show_link(int argc, char *argv[], const char *use)
 	if (p_arg && !o_arg)
 		die("-p requires -o");
 
+	if (S_arg) {
+		dladm_continuous(linkid, NULL, interval, LINK_REPORT);
+		return;
+	}
+
 	if (p_arg && strcasecmp(fields_str, "all") == 0)
 		die("\"-o all\" is invalid with -p");
 
@@ -2604,7 +2965,6 @@ do_show_link(int argc, char *argv[], const char *use)
 		return;
 	}
 
-
 	fields = parse_output_fields(fields_str, link_fields, DEV_LINK_FIELDS,
 	    CMD_TYPE_ANY, &nfields);
 
@@ -2641,17 +3001,17 @@ do_show_aggr(int argc, char *argv[], const char *use)
 	int			interval = 0;
 	int			key;
 	dladm_status_t		status;
-	boolean_t o_arg = B_FALSE;
-	char *fields_str = NULL;
-	print_field_t **fields;
-	uint_t nfields;
-	char *all_fields =
+	boolean_t		o_arg = B_FALSE;
+	char			*fields_str = NULL;
+	print_field_t		**fields;
+	uint_t			nfields;
+	char			*all_fields =
 	    "link,policy,addrpolicy,lacpactivity,lacptimer,flags";
-	char *all_lacp_fields =
+	char			*all_lacp_fields =
 	    "link,port,aggregatable,sync,coll,dist,defaulted,expired";
-	char *all_stats_fields =
+	char			*all_stats_fields =
 	    "link,port,ipackets,rbytes,opackets,obytes,ipktdist,opktdist";
-	char *all_extended_fields =
+	char			*all_extended_fields =
 	    "link,port,speed,duplex,state,address,portstate";
 	print_field_t	*pf;
 	int		pfmax;
@@ -2806,138 +3166,222 @@ do_show_aggr(int argc, char *argv[], const char *use)
 	}
 }
 
-static void
-do_show_dev(int argc, char *argv[], const char *use)
+static dladm_status_t
+print_phys_default(show_state_t *state, datalink_id_t linkid,
+    const char *link, uint32_t flags, uint32_t media)
 {
-	int option;
-	char *dev = NULL;
-	boolean_t s_arg = B_FALSE;
-	boolean_t i_arg = B_FALSE;
-	boolean_t o_arg = B_FALSE;
-	boolean_t p_arg = B_FALSE;
-	datalink_id_t linkid;
-	int interval = 0;
-	show_state_t state;
-	char *fields_str = NULL;
-	print_field_t **fields;
-	uint_t nfields;
-	char *all_fields = "link,state,speed,duplex";
-	static char *allstat_fields =
-	    "link,ipackets,rbytes,ierrors,opackets,obytes,oerrors";
+	dladm_phys_attr_t dpa;
+	dladm_status_t status;
+	link_fields_buf_t pattr;
 
-	bzero(&state, sizeof (state));
-	fields_str = all_fields;
+	status = dladm_phys_info(linkid, &dpa, state->ls_flags);
+	if (status != DLADM_STATUS_OK)
+		goto done;
 
-	opterr = 0;
-	while ((option = getopt_long(argc, argv, ":psi:o:",
-	    show_lopts, NULL)) != -1) {
-		switch (option) {
-		case 'p':
-			if (p_arg)
-				die_optdup(option);
+	(void) snprintf(pattr.link_phys_device,
+	    sizeof (pattr.link_phys_device), "%s", dpa.dp_dev);
+	(void) dladm_media2str(media, pattr.link_phys_media);
+	if (state->ls_flags == DLADM_OPT_ACTIVE) {
+		boolean_t islink;
 
-			p_arg = B_TRUE;
-			break;
-		case 's':
-			if (s_arg)
-				die_optdup(option);
+		if (!dpa.dp_novanity) {
+			(void) strlcpy(pattr.link_name, link,
+			    sizeof (pattr.link_name));
+			islink = B_TRUE;
+		} else {
+			/*
+			 * This is a physical link that does not have
+			 * vanity naming support.
+ */ + (void) strlcpy(pattr.link_name, dpa.dp_dev, + sizeof (pattr.link_name)); + islink = B_FALSE; + } - s_arg = B_TRUE; - break; - case 'o': - o_arg = B_TRUE; - fields_str = optarg; - break; - case 'i': - if (i_arg) - die_optdup(option); + (void) get_linkstate(pattr.link_name, islink, + pattr.link_phys_state); + (void) snprintf(pattr.link_phys_speed, + sizeof (pattr.link_phys_speed), "%u", + (uint_t)((get_ifspeed(pattr.link_name, + islink)) / 1000000ull)); + (void) get_linkduplex(pattr.link_name, islink, + pattr.link_phys_duplex); + } else { + (void) snprintf(pattr.link_name, sizeof (pattr.link_name), + "%s", link); + (void) snprintf(pattr.link_flags, sizeof (pattr.link_flags), + "%c----", flags & DLADM_OPT_ACTIVE ? '-' : 'r'); + } - i_arg = B_TRUE; - if (!str2int(optarg, &interval) || interval == 0) - die("invalid interval value '%s'", optarg); - break; - default: - die_opterr(optopt, option, use); - break; - } + if (!state->ls_parseable && !state->ls_printheader) { + print_header(&state->ls_print); + state->ls_printheader = B_TRUE; } - if (p_arg && !o_arg) - die("-p requires -o"); + dladm_print_output(&state->ls_print, state->ls_parseable, + dladm_print_field, (void *)&pattr); - if (p_arg && strcasecmp(fields_str, "all") == 0) - die("\"-o all\" is invalid with -p"); +done: + return (status); +} - if (i_arg && !s_arg) - die("the option -i can be used only with -s"); +typedef struct { + show_state_t *ms_state; + char *ms_link; + dladm_macaddr_attr_t *ms_mac_attr; +} print_phys_mac_state_t; - if (o_arg && strcasecmp(fields_str, "all") == 0) { - if (!s_arg) - fields_str = all_fields; +/* callback of dladm_print_output() */ +static char * +print_phys_one_mac_callback(print_field_t *pf, void *arg) +{ + print_phys_mac_state_t *mac_state = arg; + dladm_macaddr_attr_t *attr = mac_state->ms_mac_attr; + static char buf[DLADM_STRSIZE]; + boolean_t is_primary = (attr->ma_slot == 0); + boolean_t is_parseable = mac_state->ms_state->ls_parseable; + + switch (pf->pf_index) { + case PHYS_M_LINK: + (void) snprintf(buf, sizeof (buf), "%s", + (is_primary || is_parseable) ? mac_state->ms_link : " "); + break; + case PHYS_M_SLOT: + if (is_primary) + (void) snprintf(buf, sizeof (buf), gettext("primary")); else - fields_str = allstat_fields; + (void) snprintf(buf, sizeof (buf), "%d", attr->ma_slot); + break; + case PHYS_M_ADDRESS: + (void) dladm_aggr_macaddr2str(attr->ma_addr, buf); + break; + case PHYS_M_INUSE: + (void) snprintf(buf, sizeof (buf), "%s", + attr->ma_flags & DLADM_MACADDR_USED ? gettext("yes") : + gettext("no")); + break; + case PHYS_M_CLIENT: + /* + * CR 6678526: resolve link id to actual link name if + * it is valid. 
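A note on the SLOT column produced just above: slot 0 of a NIC's factory MAC address table is reported as the literal string "primary" rather than a number. The labeling rule in isolation (function and type names invented for the example):

#include <stdio.h>

static void
format_slot(char *buf, size_t len, unsigned int slot)
{
	/* Slot 0 holds the device's primary factory MAC address. */
	if (slot == 0)
		(void) snprintf(buf, len, "primary");
	else
		(void) snprintf(buf, len, "%u", slot);
}

int
main(void)
{
	char		buf[16];
	unsigned int	s;

	for (s = 0; s < 3; s++) {
		format_slot(buf, sizeof (buf), s);
		(void) printf("slot %u -> %s\n", s, buf);
	}
	return (0);
}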
+ */ + (void) snprintf(buf, sizeof (buf), "%s", attr->ma_client_name); + break; } - if (!o_arg && s_arg) - fields_str = allstat_fields; - - if (s_arg && p_arg) - die("the option -s cannot be used with -p"); - - /* get dev name (optional last argument) */ - if (optind == (argc-1)) { - uint32_t flags; + return (buf); +} - dev = argv[optind]; +typedef struct { + show_state_t *hs_state; + char *hs_link; + dladm_hwgrp_attr_t *hs_grp_attr; +} print_phys_hwgrp_state_t; - if (dladm_dev2linkid(dev, &linkid) != DLADM_STATUS_OK) - die("invalid device %s", dev); +static char * +print_phys_one_hwgrp_callback(print_field_t *pf, void *arg) +{ + print_phys_hwgrp_state_t *hg_state = arg; + dladm_hwgrp_attr_t *attr = hg_state->hs_grp_attr; + static char buf[DLADM_STRSIZE]; - if ((dladm_datalink_id2info(linkid, &flags, NULL, NULL, - NULL, 0) != DLADM_STATUS_OK) || - !(flags & DLADM_OPT_ACTIVE)) { - die("device %s has been removed", dev); + switch (pf->pf_index) { + case PHYS_H_LINK: + (void) snprintf(buf, sizeof (buf), "%s", attr->hg_link_name); + break; + case PHYS_H_GROUP: + (void) snprintf(buf, sizeof (buf), "%d", attr->hg_grp_num); + break; + case PHYS_H_GRPTYPE: + (void) snprintf(buf, sizeof (buf), "%s", + attr->hg_grp_type == DLADM_HWGRP_TYPE_RX ? "RX" : "TX"); + break; + case PHYS_H_RINGS: + (void) snprintf(buf, sizeof (buf), "%d", attr->hg_n_rings); + break; + case PHYS_H_CLIENTS: + if (attr->hg_client_names[0] == '\0') { + (void) snprintf(buf, sizeof (buf), "--"); + } else { + (void) snprintf(buf, sizeof (buf), "%s ", + attr->hg_client_names); } - } else if (optind != argc) { - usage(); + break; } - state.ls_parseable = p_arg; - state.ls_donefirst = B_FALSE; + return (buf); +} - if (s_arg) { - dev_stats(dev, interval, fields_str, &state); - return; +/* callback of dladm_walk_macaddr, invoked for each MAC address slot */ +static boolean_t +print_phys_mac_callback(void *arg, dladm_macaddr_attr_t *attr) +{ + print_phys_mac_state_t *mac_state = arg; + show_state_t *state = mac_state->ms_state; + + if (!state->ls_parseable && !state->ls_printheader) { + print_header(&state->ls_print); + state->ls_printheader = B_TRUE; } - fields = parse_output_fields(fields_str, dev_fields, DEV_MAX_FIELDS, - CMD_TYPE_ANY, &nfields); + mac_state->ms_mac_attr = attr; + dladm_print_output(&state->ls_print, state->ls_parseable, + print_phys_one_mac_callback, mac_state); - if (fields == NULL) { - die("invalid field(s) specified"); - return; - } + return (B_TRUE); +} - state.ls_print.ps_fields = fields; - state.ls_print.ps_nfields = nfields; +/* invoked by show-phys -m for each physical data-link */ +static dladm_status_t +print_phys_mac(show_state_t *state, datalink_id_t linkid, char *link) +{ + print_phys_mac_state_t mac_state; - if (dev == NULL) { - (void) dladm_mac_walk(show_dev, &state); - } else { - (void) show_dev(dev, &state); + mac_state.ms_state = state; + mac_state.ms_link = link; + + return (dladm_walk_macaddr(linkid, &mac_state, + print_phys_mac_callback)); +} + +/* callback of dladm_walk_hwgrp, invoked for each MAC hwgrp */ +static boolean_t +print_phys_hwgrp_callback(void *arg, dladm_hwgrp_attr_t *attr) +{ + print_phys_hwgrp_state_t *hwgrp_state = arg; + show_state_t *state = hwgrp_state->hs_state; + + if (!state->ls_parseable && !state->ls_printheader) { + print_header(&state->ls_print); + state->ls_printheader = B_TRUE; } + hwgrp_state->hs_grp_attr = attr; + dladm_print_output(&state->ls_print, state->ls_parseable, + print_phys_one_hwgrp_callback, hwgrp_state); + + return (B_TRUE); } +/* invoked by show-phys 
-H for each physical data-link */ +static dladm_status_t +print_phys_hwgrp(show_state_t *state, datalink_id_t linkid, char *link) +{ + print_phys_hwgrp_state_t hwgrp_state; + + hwgrp_state.hs_state = state; + hwgrp_state.hs_link = link; + return (dladm_walk_hwgrp(linkid, &hwgrp_state, + print_phys_hwgrp_callback)); +} static dladm_status_t -print_phys(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *pattr) +print_phys(show_state_t *state, datalink_id_t linkid) { char link[MAXLINKNAMELEN]; - dladm_phys_attr_t dpa; uint32_t flags; + dladm_status_t status; datalink_class_t class; uint32_t media; - dladm_status_t status; if ((status = dladm_datalink_id2info(linkid, &flags, &class, &media, link, MAXLINKNAMELEN)) != DLADM_STATUS_OK) { @@ -2954,44 +3398,12 @@ print_phys(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *pattr) goto done; } - status = dladm_phys_info(linkid, &dpa, state->ls_flags); - if (status != DLADM_STATUS_OK) - goto done; - - (void) snprintf(pattr->link_phys_device, - sizeof (pattr->link_phys_device), "%s", dpa.dp_dev); - (void) dladm_media2str(media, pattr->link_phys_media); - if (state->ls_flags == DLADM_OPT_ACTIVE) { - boolean_t islink; - - if (!dpa.dp_novanity) { - (void) strlcpy(pattr->link_name, link, - sizeof (pattr->link_name)); - islink = B_TRUE; - } else { - /* - * This is a physical link that does not have - * vanity naming support. - */ - (void) strlcpy(pattr->link_name, dpa.dp_dev, - sizeof (pattr->link_name)); - islink = B_FALSE; - } - - (void) get_linkstate(pattr->link_name, islink, - pattr->link_phys_state); - (void) snprintf(pattr->link_phys_speed, - sizeof (pattr->link_phys_speed), "%u", - (uint_t)((get_ifspeed(pattr->link_name, - islink)) / 1000000ull)); - (void) get_linkduplex(pattr->link_name, islink, - pattr->link_phys_duplex); - } else { - (void) snprintf(pattr->link_name, sizeof (pattr->link_name), - "%s", link); - (void) snprintf(pattr->link_flags, sizeof (pattr->link_flags), - "%c----", flags & DLADM_OPT_ACTIVE ? '-' : 'r'); - } + if (state->ls_mac) + status = print_phys_mac(state, linkid, link); + else if (state->ls_hwgrp) + status = print_phys_hwgrp(state, linkid, link); + else + status = print_phys_default(state, linkid, link, flags, media); done: return (status); @@ -3000,29 +3412,12 @@ done: static int show_phys(datalink_id_t linkid, void *arg) { - show_state_t *state = arg; - dladm_status_t status; - link_fields_buf_t pattr; - - bzero(&pattr, sizeof (link_fields_buf_t)); - status = print_phys(state, linkid, &pattr); - if (status != DLADM_STATUS_OK) - goto done; - - if (!state->ls_parseable && !state->ls_printheader) { - print_header(&state->ls_print); - state->ls_printheader = B_TRUE; - } - - dladm_print_output(&state->ls_print, state->ls_parseable, - dladm_print_field, (void *)&pattr); + show_state_t *state = arg; -done: - state->ls_status = status; + state->ls_status = print_phys(state, linkid); return (DLADM_WALK_CONTINUE); } - /* * Print the active topology information. */ @@ -3052,8 +3447,8 @@ print_vlan(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *l) (void) snprintf(l->link_vlan_vid, sizeof (l->link_vlan_vid), "%d", vinfo.dv_vid); - (void) snprintf(l->link_flags, sizeof (l->link_flags), "%c%c---", - vinfo.dv_force ? 'f' : '-', vinfo.dv_implicit ? 'i' : '-'); + (void) snprintf(l->link_flags, sizeof (l->link_flags), "%c----", + vinfo.dv_force ? 
'f' : '-'); done: return (status); @@ -3091,6 +3486,8 @@ do_show_phys(int argc, char *argv[], const char *use) uint32_t flags = DLADM_OPT_ACTIVE; boolean_t p_arg = B_FALSE; boolean_t o_arg = B_FALSE; + boolean_t m_arg = B_FALSE; + boolean_t H_arg = B_FALSE; datalink_id_t linkid = DATALINK_ALL_LINKID; show_state_t state; dladm_status_t status; @@ -3100,10 +3497,15 @@ do_show_phys(int argc, char *argv[], const char *use) char *all_active_fields = "link,media,state,speed,duplex,device"; char *all_inactive_fields = "link,device,media,flags"; + char *all_mac_fields = "link,slot,address,inuse,client"; + char *all_hwgrp_fields = + "link,group,grouptype,rings,clients"; + print_field_t *pf; + int pfmax; bzero(&state, sizeof (state)); opterr = 0; - while ((option = getopt_long(argc, argv, ":pPo:", + while ((option = getopt_long(argc, argv, ":pPo:mH", show_lopts, NULL)) != -1) { switch (option) { case 'p': @@ -3122,6 +3524,12 @@ do_show_phys(int argc, char *argv[], const char *use) o_arg = B_TRUE; fields_str = optarg; break; + case 'm': + m_arg = B_TRUE; + break; + case 'H': + H_arg = B_TRUE; + break; default: die_opterr(optopt, option, use); break; @@ -3131,6 +3539,9 @@ do_show_phys(int argc, char *argv[], const char *use) if (p_arg && !o_arg) die("-p requires -o"); + if (m_arg && H_arg) + die("-m cannot combine with -H"); + if (p_arg && strcasecmp(fields_str, "all") == 0) die("\"-o all\" is invalid with -p"); @@ -3147,16 +3558,42 @@ do_show_phys(int argc, char *argv[], const char *use) state.ls_parseable = p_arg; state.ls_flags = flags; state.ls_donefirst = B_FALSE; + state.ls_mac = m_arg; + state.ls_hwgrp = H_arg; + + if (m_arg && !(flags & DLADM_OPT_ACTIVE)) { + /* + * We can only display the factory MAC addresses of + * active data-links. + */ + die("-m not compatible with -P"); + } if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) { - if (state.ls_flags & DLADM_OPT_ACTIVE) + if (state.ls_mac) + fields_str = all_mac_fields; + else if (state.ls_hwgrp) + fields_str = all_hwgrp_fields; + else if (state.ls_flags & DLADM_OPT_ACTIVE) { fields_str = all_active_fields; - else + } else { fields_str = all_inactive_fields; + } + } + + if (state.ls_mac) { + pf = phys_m_fields; + pfmax = PHYS_M_MAX_FIELDS; + } else if (state.ls_hwgrp) { + pf = phys_h_fields; + pfmax = PHYS_H_MAX_FIELDS; + } else { + pf = phys_fields; + pfmax = PHYS_MAX_FIELDS; } - fields = parse_output_fields(fields_str, phys_fields, - PHYS_MAX_FIELDS, CMD_TYPE_ANY, &nfields); + fields = parse_output_fields(fields_str, pf, + pfmax, CMD_TYPE_ANY, &nfields); if (fields == NULL) { die("invalid field(s) specified"); @@ -3267,6 +3704,661 @@ do_show_vlan(int argc, char *argv[], const char *use) } static void +do_create_vnic(int argc, char *argv[], const char *use) +{ + datalink_id_t linkid, dev_linkid; + char devname[MAXLINKNAMELEN]; + char name[MAXLINKNAMELEN]; + boolean_t l_arg = B_FALSE; + uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + char *altroot = NULL; + char option; + char *endp = NULL; + dladm_status_t status; + vnic_mac_addr_type_t mac_addr_type = VNIC_MAC_ADDR_TYPE_AUTO; + uchar_t *mac_addr; + int mac_slot = -1, maclen = 0, mac_prefix_len = 0; + dladm_arg_list_t *proplist = NULL; + uint16_t vid = 0; + + opterr = 0; + while ((option = getopt_long(argc, argv, ":tfR:l:m:n:p:r:v:H", + vnic_lopts, NULL)) != -1) { + switch (option) { + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'R': + altroot = optarg; + break; + case 'l': + if (strlcpy(devname, optarg, MAXLINKNAMELEN) >= + MAXLINKNAMELEN) + die("link name 
too long"); + l_arg = B_TRUE; + break; + case 'm': + if (strcmp(optarg, "fixed") == 0) { + /* + * A fixed MAC address must be specified + * by its value, not by the keyword 'fixed'. + */ + die("'fixed' is not a valid MAC address"); + } + if (dladm_vnic_str2macaddrtype(optarg, + &mac_addr_type) != DLADM_STATUS_OK) { + mac_addr_type = VNIC_MAC_ADDR_TYPE_FIXED; + /* MAC address specified by value */ + mac_addr = _link_aton(optarg, &maclen); + if (mac_addr == NULL) { + if (maclen == -1) + die("invalid MAC address"); + else + die("out of memory"); + exit(1); + } + } + break; + case 'n': + errno = 0; + mac_slot = (int)strtol(optarg, &endp, 10); + if (errno != 0 || *endp != '\0') + die("invalid slot number"); + break; + case 'p': + if (dladm_parse_link_props(optarg, &proplist, B_FALSE) + != DLADM_STATUS_OK) + die("invalid vnic property"); + break; + case 'r': + mac_addr = _link_aton(optarg, &mac_prefix_len); + if (mac_addr == NULL) { + if (mac_prefix_len == -1) + die("invalid MAC address"); + else + die("out of memory"); + exit(1); + } + break; + case 'v': + vid = (int)strtol(optarg, &endp, 10); + if (errno != 0 || *endp != '\0' || vid == 0) + /* VID of 0 is invalid */ + die("invalid VLAN id"); + break; + case 'f': + flags |= DLADM_OPT_FORCE; + break; + case 'H': + flags |= DLADM_OPT_HWRINGS; + break; + default: + die_opterr(optopt, option, use); + } + } + + /* + * 'f' - force, flag can be specified only with 'v' - vlan. + */ + if ((flags & DLADM_OPT_FORCE) != 0 && vid == 0) + die("-f option can only be used with -v"); + + if (mac_prefix_len != 0 && mac_addr_type != VNIC_MAC_ADDR_TYPE_RANDOM && + mac_addr_type != VNIC_MAC_ADDR_TYPE_FIXED) + usage(); + + /* check required options */ + if (!l_arg) + usage(); + + if (mac_slot != -1 && mac_addr_type != VNIC_MAC_ADDR_TYPE_FACTORY) + usage(); + + /* the VNIC id is the required operand */ + if (optind != (argc - 1)) + usage(); + + if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN) + die("link name too long '%s'", argv[optind]); + + if (!dladm_valid_linkname(name)) + die("invalid link name '%s'", argv[optind]); + + if (altroot != NULL) + altroot_cmd(altroot, argc, argv); + + if (dladm_name2info(devname, &dev_linkid, NULL, NULL, NULL) != + DLADM_STATUS_OK) + die("invalid link name '%s'", devname); + + status = dladm_vnic_create(name, dev_linkid, mac_addr_type, mac_addr, + maclen, &mac_slot, mac_prefix_len, vid, &linkid, proplist, flags); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "vnic creation over %s failed", devname); + + dladm_free_props(proplist); +} + +static void +do_etherstub_check(const char *name, datalink_id_t linkid, boolean_t etherstub, + uint32_t flags) +{ + boolean_t is_etherstub; + dladm_vnic_attr_t attr; + + if (dladm_vnic_info(linkid, &attr, flags) != DLADM_STATUS_OK) { + /* + * Let the delete continue anyway. + */ + return; + } + is_etherstub = (attr.va_link_id == DATALINK_INVALID_LINKID); + if (is_etherstub != etherstub) { + die("'%s' is not %s", name, + (is_etherstub ? 
"a vnic" : "an etherstub")); + } +} + +static void +do_delete_vnic_common(int argc, char *argv[], const char *use, + boolean_t etherstub) +{ + char option; + uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + datalink_id_t linkid; + char *altroot = NULL; + dladm_status_t status; + + opterr = 0; + while ((option = getopt_long(argc, argv, ":R:t", lopts, + NULL)) != -1) { + switch (option) { + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'R': + altroot = optarg; + break; + default: + die_opterr(optopt, option, use); + } + } + + /* get vnic name (required last argument) */ + if (optind != (argc - 1)) + usage(); + + if (altroot != NULL) + altroot_cmd(altroot, argc, argv); + + status = dladm_name2info(argv[optind], &linkid, NULL, NULL, NULL); + if (status != DLADM_STATUS_OK) + die("invalid link name '%s'", argv[optind]); + + if ((flags & DLADM_OPT_ACTIVE) != 0) { + do_etherstub_check(argv[optind], linkid, etherstub, + DLADM_OPT_ACTIVE); + } + if ((flags & DLADM_OPT_PERSIST) != 0) { + do_etherstub_check(argv[optind], linkid, etherstub, + DLADM_OPT_PERSIST); + } + + status = dladm_vnic_delete(linkid, flags); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "vnic deletion failed"); +} + +static void +do_delete_vnic(int argc, char *argv[], const char *use) +{ + do_delete_vnic_common(argc, argv, use, B_FALSE); +} + +/* ARGSUSED */ +static void +do_up_vnic_common(int argc, char *argv[], const char *use, boolean_t vlan) +{ + datalink_id_t linkid = DATALINK_ALL_LINKID; + dladm_status_t status; + char *type; + + type = vlan ? "vlan" : "vnic"; + + /* + * get the id or the name of the vnic/vlan (optional last argument) + */ + if (argc == 2) { + status = dladm_name2info(argv[1], &linkid, NULL, NULL, NULL); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (argc > 2) { + usage(); + } + + if (vlan) + status = dladm_vlan_up(linkid); + else + status = dladm_vnic_up(linkid, 0); + +done: + if (status != DLADM_STATUS_OK) { + if (argc == 2) { + die_dlerr(status, + "could not bring up %s '%s'", type, argv[1]); + } else { + die_dlerr(status, "could not bring %ss up", type); + } + } +} + +static void +do_up_vnic(int argc, char *argv[], const char *use) +{ + do_up_vnic_common(argc, argv, use, B_FALSE); +} + +static void +dump_vnics_head(const char *dev) +{ + if (strlen(dev)) + (void) printf("%s", dev); + + (void) printf("\tipackets rbytes opackets obytes "); + + if (strlen(dev)) + (void) printf("%%ipkts %%opkts\n"); + else + (void) printf("\n"); +} + +static void +dump_vnic_stat(const char *name, datalink_id_t vnic_id, + show_vnic_state_t *state, pktsum_t *vnic_stats, pktsum_t *tot_stats) +{ + pktsum_t diff_stats; + pktsum_t *old_stats = &state->vs_prevstats[vnic_id]; + + dladm_stats_diff(&diff_stats, vnic_stats, old_stats); + + (void) printf("%s", name); + + (void) printf("\t%-10llu", diff_stats.ipackets); + (void) printf("%-12llu", diff_stats.rbytes); + (void) printf("%-10llu", diff_stats.opackets); + (void) printf("%-12llu", diff_stats.obytes); + + if (tot_stats) { + if (tot_stats->ipackets == 0) { + (void) printf("\t-"); + } else { + (void) printf("\t%-6.1f", (double)diff_stats.ipackets/ + (double)tot_stats->ipackets * 100); + } + if (tot_stats->opackets == 0) { + (void) printf("\t-"); + } else { + (void) printf("\t%-6.1f", (double)diff_stats.opackets/ + (double)tot_stats->opackets * 100); + } + } + (void) printf("\n"); + + *old_stats = *vnic_stats; +} + +/* + * Called from the walker dladm_vnic_walk_sys() for each vnic to display + * vnic information or statistics. 
+ */ +static dladm_status_t +print_vnic(show_vnic_state_t *state, datalink_id_t linkid) +{ + dladm_vnic_attr_t attr, *vnic = &attr; + dladm_status_t status; + boolean_t is_etherstub; + char devname[MAXLINKNAMELEN]; + char vnic_name[MAXLINKNAMELEN]; + char mstr[MAXMACADDRLEN * 3]; + vnic_fields_buf_t vbuf; + + if ((status = dladm_vnic_info(linkid, vnic, state->vs_flags)) != + DLADM_STATUS_OK) + return (status); + + is_etherstub = (vnic->va_link_id == DATALINK_INVALID_LINKID); + if (state->vs_etherstub != is_etherstub) { + /* + * Want all etherstub but it's not one, or want + * non-etherstub and it's one. + */ + return (DLADM_STATUS_OK); + } + + if (state->vs_link_id != DATALINK_ALL_LINKID) { + if (state->vs_link_id != vnic->va_link_id) + return (DLADM_STATUS_OK); + } + + if (dladm_datalink_id2info(linkid, NULL, NULL, + NULL, vnic_name, sizeof (vnic_name)) != DLADM_STATUS_OK) + return (DLADM_STATUS_BADARG); + + bzero(devname, sizeof (devname)); + if (!is_etherstub && + dladm_datalink_id2info(vnic->va_link_id, NULL, NULL, + NULL, devname, sizeof (devname)) != DLADM_STATUS_OK) + return (DLADM_STATUS_BADARG); + + state->vs_found = B_TRUE; + if (state->vs_stats) { + /* print vnic statistics */ + pktsum_t vnic_stats; + + if (state->vs_firstonly) { + if (state->vs_donefirst) + return (0); + state->vs_donefirst = B_TRUE; + } + + if (!state->vs_printstats) { + /* + * get vnic statistics and add to the sum for the + * named device. + */ + get_link_stats(vnic_name, &vnic_stats); + dladm_stats_total(&state->vs_totalstats, &vnic_stats, + &state->vs_prevstats[vnic->va_vnic_id]); + } else { + /* get and print vnic statistics */ + get_link_stats(vnic_name, &vnic_stats); + dump_vnic_stat(vnic_name, linkid, state, &vnic_stats, + &state->vs_totalstats); + } + return (DLADM_STATUS_OK); + } else { + (void) snprintf(vbuf.vnic_link, sizeof (vbuf.vnic_link), + "%s", vnic_name); + + if (!is_etherstub) { + + (void) snprintf(vbuf.vnic_over, sizeof (vbuf.vnic_over), + "%s", devname); + (void) snprintf(vbuf.vnic_speed, + sizeof (vbuf.vnic_speed), "%u", + (uint_t)((get_ifspeed(vnic_name, B_TRUE)) + / 1000000ull)); + + switch (vnic->va_mac_addr_type) { + case VNIC_MAC_ADDR_TYPE_FIXED: + case VNIC_MAC_ADDR_TYPE_PRIMARY: + (void) snprintf(vbuf.vnic_macaddrtype, + sizeof (vbuf.vnic_macaddrtype), + gettext("fixed")); + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: + (void) snprintf(vbuf.vnic_macaddrtype, + sizeof (vbuf.vnic_macaddrtype), + gettext("random")); + break; + case VNIC_MAC_ADDR_TYPE_FACTORY: + (void) snprintf(vbuf.vnic_macaddrtype, + sizeof (vbuf.vnic_macaddrtype), + gettext("factory, slot %d"), + vnic->va_mac_slot); + break; + } + + if (strlen(vbuf.vnic_macaddrtype) > 0) { + (void) snprintf(vbuf.vnic_macaddr, + sizeof (vbuf.vnic_macaddr), "%s", + dladm_aggr_macaddr2str(vnic->va_mac_addr, + mstr)); + } + + (void) snprintf(vbuf.vnic_vid, sizeof (vbuf.vnic_vid), + "%d", vnic->va_vid); + } + + if (!state->vs_parseable && !state->vs_printheader) { + print_header(&state->vs_print); + state->vs_printheader = B_TRUE; + } + + dladm_print_output(&state->vs_print, state->vs_parseable, + dladm_print_field, (void *)&vbuf); + + return (DLADM_STATUS_OK); + } +} + +static int +show_vnic(datalink_id_t linkid, void *arg) +{ + show_vnic_state_t *state = arg; + + state->vs_status = print_vnic(state, linkid); + return (DLADM_WALK_CONTINUE); +} + +static void +do_show_vnic_common(int argc, char *argv[], const char *use, + boolean_t etherstub) +{ + int option; + boolean_t s_arg = B_FALSE; + boolean_t i_arg = B_FALSE; + boolean_t l_arg = 
B_FALSE; + char *endp = NULL; + uint32_t interval = 0, flags = DLADM_OPT_ACTIVE; + datalink_id_t linkid = DATALINK_ALL_LINKID; + datalink_id_t dev_linkid = DATALINK_ALL_LINKID; + show_vnic_state_t state; + dladm_status_t status; + boolean_t o_arg = B_FALSE; + char *fields_str = NULL; + print_field_t **fields; + print_field_t *pf; + int pfmax; + uint_t nfields; + char *all_fields = + "link,over,speed,macaddr,macaddrtype,vid"; + char *all_e_fields = + "link"; + + bzero(&state, sizeof (state)); + opterr = 0; + while ((option = getopt_long(argc, argv, ":pPl:si:o:", lopts, + NULL)) != -1) { + switch (option) { + case 'p': + state.vs_parseable = B_TRUE; + break; + case 'P': + flags = DLADM_OPT_PERSIST; + break; + case 'l': + if (etherstub) + die("option not supported for this command"); + + if (strlcpy(state.vs_link, optarg, MAXLINKNAMELEN) >= + MAXLINKNAMELEN) + die("link name too long"); + + l_arg = B_TRUE; + break; + case 's': + if (s_arg) { + die("the option -s cannot be specified " + "more than once"); + } + s_arg = B_TRUE; + break; + case 'i': + if (i_arg) { + die("the option -i cannot be specified " + "more than once"); + } + i_arg = B_TRUE; + interval = (int)strtol(optarg, &endp, 10); + if (errno != 0 || interval == 0 || *endp != '\0') + die("invalid interval value '%s'", optarg); + break; + case 'o': + o_arg = B_TRUE; + fields_str = optarg; + break; + default: + die_opterr(optopt, option, use); + } + } + + if (i_arg && !s_arg) + die("the option -i can be used only with -s"); + + /* get vnic ID (optional last argument) */ + if (optind == (argc - 1)) { + status = dladm_name2info(argv[optind], &linkid, NULL, + NULL, NULL); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "invalid vnic name '%s'", + argv[optind]); + } + (void) strlcpy(state.vs_vnic, argv[optind], MAXLINKNAMELEN); + } else if (optind != argc) { + usage(); + } + + if (l_arg) { + status = dladm_name2info(state.vs_link, &dev_linkid, NULL, + NULL, NULL); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "invalid link name '%s'", + state.vs_link); + } + } + + state.vs_vnic_id = linkid; + state.vs_link_id = dev_linkid; + state.vs_etherstub = etherstub; + state.vs_found = B_FALSE; + state.vs_flags = flags; + + if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) { + if (etherstub) + fields_str = all_e_fields; + else + fields_str = all_fields; + } + + pf = vnic_fields; + pfmax = VNIC_MAX_FIELDS; + + fields = parse_output_fields(fields_str, pf, pfmax, CMD_TYPE_ANY, + &nfields); + + if (fields == NULL) { + die("invalid field(s) specified"); + return; + } + + state.vs_print.ps_fields = fields; + state.vs_print.ps_nfields = nfields; + + if (s_arg) { + /* Display vnic statistics */ + vnic_stats(&state, interval); + return; + } + + /* Display vnic information */ + state.vs_donefirst = B_FALSE; + + if (linkid == DATALINK_ALL_LINKID) { + (void) dladm_walk_datalink_id(show_vnic, &state, + DATALINK_CLASS_VNIC | DATALINK_CLASS_ETHERSTUB, + DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE); + } else { + (void) show_vnic(linkid, &state); + if (state.vs_status != DLADM_STATUS_OK) { + die_dlerr(state.vs_status, "failed to show vnic '%s'", + state.vs_vnic); + } + } +} + +static void +do_show_vnic(int argc, char *argv[], const char *use) +{ + do_show_vnic_common(argc, argv, use, B_FALSE); +} + +static void +do_create_etherstub(int argc, char *argv[], const char *use) +{ + uint32_t flags; + char *altroot = NULL; + char option; + dladm_status_t status; + char name[MAXLINKNAMELEN]; + uchar_t mac_addr[ETHERADDRL]; + + name[0] = '\0'; + 
bzero(mac_addr, sizeof (mac_addr)); + flags = DLADM_OPT_ANCHOR | DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + + opterr = 0; + while ((option = getopt_long(argc, argv, "tR:", + etherstub_lopts, NULL)) != -1) { + switch (option) { + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'R': + altroot = optarg; + break; + default: + die_opterr(optopt, option, use); + } + } + + /* the etherstub id is the required operand */ + if (optind != (argc - 1)) + usage(); + + if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN) + die("link name too long '%s'", argv[optind]); + + if (!dladm_valid_linkname(name)) + die("invalid link name '%s'", argv[optind]); + + if (altroot != NULL) + altroot_cmd(altroot, argc, argv); + + status = dladm_vnic_create(name, DATALINK_INVALID_LINKID, + VNIC_MAC_ADDR_TYPE_AUTO, mac_addr, ETHERADDRL, NULL, 0, 0, NULL, + NULL, flags); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "etherstub creation failed"); + + +} + +static void +do_delete_etherstub(int argc, char *argv[], const char *use) +{ + do_delete_vnic_common(argc, argv, use, B_TRUE); +} + +/* ARGSUSED */ +static void +do_show_etherstub(int argc, char *argv[], const char *use) +{ + do_show_vnic_common(argc, argv, use, B_TRUE); +} + +static void link_stats(datalink_id_t linkid, uint_t interval, char *fields_str, show_state_t *state) { @@ -3333,147 +4425,134 @@ aggr_stats(datalink_id_t linkid, show_grp_state_t *state, uint_t interval) } } +/* ARGSUSED */ static void -dev_stats(const char *dev, uint32_t interval, char *fields_str, - show_state_t *state) +vnic_stats(show_vnic_state_t *sp, uint32_t interval) { - print_field_t **fields; - uint_t nfields; - - fields = parse_output_fields(fields_str, devs_fields, DEVS_MAX_FIELDS, - CMD_TYPE_ANY, &nfields); + show_vnic_state_t state; + boolean_t specific_link, specific_dev; - if (fields == NULL) { - die("invalid field(s) specified"); - return; - } - - state->ls_print.ps_fields = fields; - state->ls_print.ps_nfields = nfields; + /* Display vnic statistics */ + dump_vnics_head(sp->vs_link); + bzero(&state, sizeof (state)); + state.vs_stats = B_TRUE; + state.vs_vnic_id = sp->vs_vnic_id; + state.vs_link_id = sp->vs_link_id; /* - * If an interval is specified, continuously show the stats - * only for the first MAC port. + * If an interval is specified, and a vnic ID is not specified, + * continuously show the stats only for the first vnic. 
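The loop that follows is the standard interval-polling shape shared by all the stats subcommands: take a sample, print the delta against the previous sample, remember the sample, sleep. A toy but runnable version (sample() stands in for the kstat reads; the real loop runs until interrupted):

#include <stdio.h>
#include <unistd.h>

typedef struct {
	unsigned long long	ipackets;
	unsigned long long	opackets;
} sum_sketch_t;

/* Stand-in for reading the link's kstats. */
static void
sample(sum_sketch_t *s)
{
	s->ipackets += 100;
	s->opackets += 42;
}

int
main(void)
{
	sum_sketch_t	prev = { 0, 0 }, now = { 0, 0 };
	unsigned int	interval = 1;	/* seconds, as given by -i */
	int		round;

	for (round = 0; round < 3; round++) {
		sample(&now);
		/* Per-interval deltas, not running totals. */
		(void) printf("%llu %llu\n",
		    now.ipackets - prev.ipackets,
		    now.opackets - prev.opackets);
		prev = now;
		(void) sleep(interval);
	}
	return (0);
}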
*/ - state->ls_firstonly = (interval != 0); + specific_link = (sp->vs_vnic_id != DATALINK_ALL_LINKID); + specific_dev = (sp->vs_link_id != DATALINK_ALL_LINKID); for (;;) { + /* Get stats for each vnic */ + state.vs_found = B_FALSE; + state.vs_donefirst = B_FALSE; + state.vs_printstats = B_FALSE; + state.vs_flags = DLADM_OPT_ACTIVE; + + if (!specific_link) { + (void) dladm_walk_datalink_id(show_vnic, &state, + DATALINK_CLASS_VNIC, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } else { + (void) show_vnic(sp->vs_vnic_id, &state); + if (state.vs_status != DLADM_STATUS_OK) { + die_dlerr(state.vs_status, + "failed to show vnic '%s'", sp->vs_vnic); + } + } - if (!state->ls_parseable) - print_header(&state->ls_print); - state->ls_donefirst = B_FALSE; + if (specific_link && !state.vs_found) + die("non-existent vnic '%s'", sp->vs_vnic); + if (specific_dev && !state.vs_found) + die("device %s has no vnics", sp->vs_link); + + /* Show totals */ + if ((specific_link | specific_dev) && !interval) { + (void) printf("Total"); + (void) printf("\t%-10llu", + state.vs_totalstats.ipackets); + (void) printf("%-12llu", + state.vs_totalstats.rbytes); + (void) printf("%-10llu", + state.vs_totalstats.opackets); + (void) printf("%-12llu\n", + state.vs_totalstats.obytes); + } - if (dev == NULL) - (void) dladm_mac_walk(show_dev_stats, state); - else - (void) show_dev_stats(dev, state); + /* Show stats for each vnic */ + state.vs_donefirst = B_FALSE; + state.vs_printstats = B_TRUE; + + if (!specific_link) { + (void) dladm_walk_datalink_id(show_vnic, &state, + DATALINK_CLASS_VNIC, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } else { + (void) show_vnic(sp->vs_vnic_id, &state); + if (state.vs_status != DLADM_STATUS_OK) { + die_dlerr(state.vs_status, + "failed to show vnic '%s'", sp->vs_vnic); + } + } if (interval == 0) break; (void) sleep(interval); } - - if (dev != NULL && state->ls_status != DLADM_STATUS_OK) - die_dlerr(state->ls_status, "cannot show device '%s'", dev); } -/* accumulate stats (s1 += (s2 - s3)) */ static void -stats_total(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) -{ - s1->ipackets += (s2->ipackets - s3->ipackets); - s1->opackets += (s2->opackets - s3->opackets); - s1->rbytes += (s2->rbytes - s3->rbytes); - s1->obytes += (s2->obytes - s3->obytes); - s1->ierrors += (s2->ierrors - s3->ierrors); - s1->oerrors += (s2->oerrors - s3->oerrors); -} - -/* compute stats differences (s1 = s2 - s3) */ -static void -stats_diff(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) -{ - s1->ipackets = s2->ipackets - s3->ipackets; - s1->opackets = s2->opackets - s3->opackets; - s1->rbytes = s2->rbytes - s3->rbytes; - s1->obytes = s2->obytes - s3->obytes; - s1->ierrors = s2->ierrors - s3->ierrors; - s1->oerrors = s2->oerrors - s3->oerrors; -} - -static void -get_stats(char *module, int instance, const char *name, pktsum_t *stats) +get_mac_stats(const char *dev, pktsum_t *stats) { kstat_ctl_t *kcp; kstat_t *ksp; + char module[DLPI_LINKNAME_MAX]; + uint_t instance; - if ((kcp = kstat_open()) == NULL) { - warn("kstat open operation failed"); + + bzero(stats, sizeof (*stats)); + + if (dlpi_parselink(dev, module, &instance) != DLPI_SUCCESS) return; - } - if ((ksp = kstat_lookup(kcp, module, instance, (char *)name)) == NULL) { - /* - * The kstat query could fail if the underlying MAC - * driver was already detached. 
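Both the removed get_stats() and the new get_mac_stats()/get_link_stats() below are thin wrappers over the same libkstat sequence: open, lookup, read, close. For reference, the bare sequence looks like this; the module, instance, and statistic names are examples only, and error handling is trimmed to the minimum (link with -lkstat):

#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t	*kcp;
	kstat_t		*ksp;
	kstat_named_t	*kn;

	if ((kcp = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}
	/* e1000g instance 0's "mac" kstat; substitute any live NIC. */
	if ((ksp = kstat_lookup(kcp, "e1000g", 0, "mac")) != NULL &&
	    kstat_read(kcp, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "ipackets64")) != NULL) {
		(void) printf("ipackets64 = %llu\n",
		    (unsigned long long)kn->value.ui64);
	}
	(void) kstat_close(kcp);
	return (0);
}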
- */ - (void) kstat_close(kcp); + if ((kcp = kstat_open()) == NULL) { + warn("kstat open operation failed"); return; } - if (kstat_read(kcp, ksp, NULL) == -1) - goto bail; - - if (dladm_kstat_value(ksp, "ipackets64", KSTAT_DATA_UINT64, - &stats->ipackets) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "opackets64", KSTAT_DATA_UINT64, - &stats->opackets) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "rbytes64", KSTAT_DATA_UINT64, - &stats->rbytes) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "obytes64", KSTAT_DATA_UINT64, - &stats->obytes) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT32, - &stats->ierrors) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT32, - &stats->oerrors) < 0) - goto bail; + ksp = dladm_kstat_lookup(kcp, module, instance, "mac", NULL); + if (ksp != NULL) + dladm_get_stats(kcp, ksp, stats); -bail: (void) kstat_close(kcp); - return; } static void -get_mac_stats(const char *dev, pktsum_t *stats) +get_link_stats(const char *link, pktsum_t *stats) { - char module[DLPI_LINKNAME_MAX]; - uint_t instance; + kstat_ctl_t *kcp; + kstat_t *ksp; bzero(stats, sizeof (*stats)); - if (dlpi_parselink(dev, module, &instance) != DLPI_SUCCESS) + + if ((kcp = kstat_open()) == NULL) { + warn("kstat_open operation failed"); return; + } - get_stats(module, instance, "mac", stats); -} + ksp = dladm_kstat_lookup(kcp, "link", 0, link, NULL); -static void -get_link_stats(const char *link, pktsum_t *stats) -{ - bzero(stats, sizeof (*stats)); - get_stats("link", 0, link, stats); + if (ksp != NULL) + dladm_get_stats(kcp, ksp, stats); + + (void) kstat_close(kcp); } static int @@ -3547,7 +4626,7 @@ get_linkstate(const char *name, boolean_t islink, char *buf) if (get_one_kstat(name, "link_state", KSTAT_DATA_UINT32, &linkstate, islink) != 0) { - (void) strlcpy(buf, "unknown", DLADM_STRSIZE); + (void) strlcpy(buf, "?", DLADM_STRSIZE); return (buf); } return (dladm_linkstate2str(linkstate, buf)); @@ -4271,92 +5350,6 @@ do_disconnect_wifi(int argc, char **argv, const char *use) die_dlerr(status, "cannot disconnect"); } - -static void -free_props(prop_list_t *list) -{ - if (list != NULL) { - free(list->pl_buf); - free(list); - } -} - -static int -parse_props(char *str, prop_list_t **listp, boolean_t novalues) -{ - prop_list_t *list; - prop_info_t *pip; - char *buf, *curr; - int len, i; - - list = malloc(sizeof (prop_list_t)); - if (list == NULL) - return (-1); - - list->pl_count = 0; - list->pl_buf = buf = strdup(str); - if (buf == NULL) - goto fail; - - /* - * buf is a string of form [<propname>=<value>][,<propname>=<value>]+ - * where each <value> string itself could be a comma-separated array. - * The loop below will count the number of propname assignments - * in pl_count; for each property, there is a pip entry with - * pi_name == <propname>, pi_count == # of elements in <value> array. - * pi_val[] contains the actual values. - * - * This could really be a combination of calls to - * strtok (token delimiter is ",") and strchr (chr '=') - * with appropriate null/string-bound-checks. 
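The aside in the removed comment is worth making concrete. Here is the strtok_r()/strchr() version it alludes to, with the caveat that motivated the hand-rolled loop: since ',' separates both properties and elements of a multi-valued property, this simple form only handles single-valued properties (an illustrative sketch, not the dladm_parse_link_props() parser that replaces this code):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char	buf[] = "mtu=1500,zone=web1";
	char	*lasts, *pair, *eq;

	for (pair = strtok_r(buf, ",", &lasts); pair != NULL;
	    pair = strtok_r(NULL, ",", &lasts)) {
		if ((eq = strchr(pair, '=')) == NULL ||
		    eq == pair || eq[1] == '\0') {
			(void) fprintf(stderr, "malformed: %s\n", pair);
			return (1);
		}
		*eq = '\0';	/* split the pair at '=' in place */
		(void) printf("prop '%s' -> value '%s'\n", pair, eq + 1);
	}
	return (0);
}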
- */ - - curr = buf; - len = strlen(buf); - pip = NULL; - for (i = 0; i < len; i++) { - char c = buf[i]; - boolean_t match = (c == '=' || c == ','); - - if (!match && i != len - 1) - continue; - - if (match) { - buf[i] = '\0'; - if (*curr == '\0') - goto fail; - } - - if (pip != NULL && c != '=') { - if (pip->pi_count > DLADM_MAX_PROP_VALCNT) - goto fail; - - if (novalues) - goto fail; - - pip->pi_val[pip->pi_count] = curr; - pip->pi_count++; - } else { - if (list->pl_count > MAX_PROPS) - goto fail; - - pip = &list->pl_info[list->pl_count]; - pip->pi_name = curr; - pip->pi_count = 0; - list->pl_count++; - if (c == ',') - pip = NULL; - } - curr = buf + i + 1; - } - *listp = list; - return (0); - -fail: - free_props(list); - return (-1); -} - static void print_linkprop(datalink_id_t linkid, show_linkprop_state_t *statep, const char *propname, dladm_prop_type_t type, @@ -4365,7 +5358,7 @@ print_linkprop(datalink_id_t linkid, show_linkprop_state_t *statep, int i; char *ptr, *lim; char buf[DLADM_STRSIZE]; - char *unknown = "?", *notsup = ""; + char *unknown = "--", *notsup = ""; char **propvals = statep->ls_propvals; uint_t valcnt = DLADM_MAX_PROP_VALCNT; dladm_status_t status; @@ -4545,7 +5538,7 @@ static void do_show_linkprop(int argc, char **argv, const char *use) { int option; - prop_list_t *proplist = NULL; + dladm_arg_list_t *proplist = NULL; datalink_id_t linkid = DATALINK_ALL_LINKID; show_linkprop_state_t state; uint32_t flags = DLADM_OPT_ACTIVE; @@ -4570,7 +5563,8 @@ do_show_linkprop(int argc, char **argv, const char *use) prop_longopts, NULL)) != -1) { switch (option) { case 'p': - if (parse_props(optarg, &proplist, B_TRUE) < 0) + if (dladm_parse_link_props(optarg, &proplist, B_TRUE) + != DLADM_STATUS_OK) die("invalid link properties specified"); break; case 'c': @@ -4628,7 +5622,7 @@ do_show_linkprop(int argc, char **argv, const char *use) } else { (void) show_linkprop_onelink(linkid, &state); } - free_props(proplist); + dladm_free_props(proplist); if (state.ls_retstatus != DLADM_STATUS_OK) exit(EXIT_FAILURE); @@ -4640,7 +5634,7 @@ show_linkprop_onelink(datalink_id_t linkid, void *arg) int i; char *buf; uint32_t flags; - prop_list_t *proplist = NULL; + dladm_arg_list_t *proplist = NULL; show_linkprop_state_t *statep = arg; dlpi_handle_t dh = NULL; @@ -4689,9 +5683,9 @@ show_linkprop_onelink(datalink_id_t linkid, void *arg) (sizeof (char *) + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT; if (proplist != NULL) { - for (i = 0; i < proplist->pl_count; i++) { + for (i = 0; i < proplist->al_count; i++) { (void) show_linkprop(linkid, - proplist->pl_info[i].pi_name, statep); + proplist->al_info[i].ai_name, statep); } } else { (void) dladm_walk_linkprop(linkid, statep, show_linkprop); @@ -4712,30 +5706,58 @@ set_linkprop_persist(datalink_id_t linkid, const char *prop_name, DLADM_OPT_PERSIST); if (status != DLADM_STATUS_OK) { - warn_dlerr(status, "cannot persistently %s link property", - reset ? "reset" : "set"); + warn_dlerr(status, "cannot persistently %s link property '%s'", + reset ? 
"reset" : "set", prop_name); } return (status); } +static int +reset_one_linkprop(datalink_id_t linkid, const char *propname, void *arg) +{ + set_linkprop_state_t *statep = arg; + dladm_status_t status; + + status = dladm_set_linkprop(linkid, propname, NULL, 0, + DLADM_OPT_ACTIVE); + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "cannot reset link property '%s' on '%s'", + propname, statep->ls_name); + } + if (!statep->ls_temp) { + dladm_status_t s; + + s = set_linkprop_persist(linkid, propname, NULL, 0, + statep->ls_reset); + if (s != DLADM_STATUS_OK) + status = s; + } + if (status != DLADM_STATUS_OK) + statep->ls_status = status; + + return (DLADM_WALK_CONTINUE); +} + static void set_linkprop(int argc, char **argv, boolean_t reset, const char *use) { - int i, option; - char errmsg[DLADM_STRSIZE]; - char *altroot = NULL; - datalink_id_t linkid; - prop_list_t *proplist = NULL; - boolean_t temp = B_FALSE; - dladm_status_t status = DLADM_STATUS_OK; + int i, option; + char errmsg[DLADM_STRSIZE]; + char *altroot = NULL; + datalink_id_t linkid; + boolean_t temp = B_FALSE; + dladm_status_t status = DLADM_STATUS_OK; + dladm_arg_list_t *proplist = NULL; opterr = 0; while ((option = getopt_long(argc, argv, ":p:R:t", prop_longopts, NULL)) != -1) { switch (option) { case 'p': - if (parse_props(optarg, &proplist, reset) < 0) + if (dladm_parse_link_props(optarg, &proplist, reset) != + DLADM_STATUS_OK) { die("invalid link properties specified"); + } break; case 't': temp = B_TRUE; @@ -4757,7 +5779,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) die("link property must be specified"); if (altroot != NULL) { - free_props(proplist); + dladm_free_props(proplist); altroot_cmd(altroot, argc, argv); } @@ -4766,24 +5788,21 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) die_dlerr(status, "link %s is not valid", argv[optind]); if (proplist == NULL) { - status = dladm_set_linkprop(linkid, NULL, NULL, 0, - DLADM_OPT_ACTIVE); - if (status != DLADM_STATUS_OK) { - warn_dlerr(status, "cannot reset link property " - "on '%s'", argv[optind]); - } - if (!temp) { - dladm_status_t s; + set_linkprop_state_t state; - s = set_linkprop_persist(linkid, NULL, NULL, 0, reset); - if (s != DLADM_STATUS_OK) - status = s; - } + state.ls_name = argv[optind]; + state.ls_reset = reset; + state.ls_temp = temp; + state.ls_status = DLADM_STATUS_OK; + + (void) dladm_walk_linkprop(linkid, &state, reset_one_linkprop); + + status = state.ls_status; goto done; } - for (i = 0; i < proplist->pl_count; i++) { - prop_info_t *pip = &proplist->pl_info[i]; + for (i = 0; i < proplist->al_count; i++) { + dladm_arg_info_t *aip = &proplist->al_info[i]; char **val; uint_t count; dladm_status_t s; @@ -4792,21 +5811,21 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) val = NULL; count = 0; } else { - val = pip->pi_val; - count = pip->pi_count; + val = aip->ai_val; + count = aip->ai_count; if (count == 0) { warn("no value specified for '%s'", - pip->pi_name); + aip->ai_name); status = DLADM_STATUS_BADARG; continue; } } - s = dladm_set_linkprop(linkid, pip->pi_name, val, count, + s = dladm_set_linkprop(linkid, aip->ai_name, val, count, DLADM_OPT_ACTIVE); if (s == DLADM_STATUS_OK) { if (!temp) { s = set_linkprop_persist(linkid, - pip->pi_name, val, count, reset); + aip->ai_name, val, count, reset); if (s != DLADM_STATUS_OK) status = s; } @@ -4815,7 +5834,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) status = s; switch (s) { case 
DLADM_STATUS_NOTFOUND: - warn("invalid link property '%s'", pip->pi_name); + warn("invalid link property '%s'", aip->ai_name); break; case DLADM_STATUS_BADVAL: { int j; @@ -4837,12 +5856,12 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) j * DLADM_PROP_VAL_MAX; } s = dladm_get_linkprop(linkid, - DLADM_PROP_VAL_MODIFIABLE, pip->pi_name, propvals, + DLADM_PROP_VAL_MODIFIABLE, aip->ai_name, propvals, &valcnt); if (s != DLADM_STATUS_OK) { warn_dlerr(status, "cannot set link property " - "'%s' on '%s'", pip->pi_name, argv[optind]); + "'%s' on '%s'", aip->ai_name, argv[optind]); free(propvals); break; } @@ -4859,7 +5878,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) if (ptr > errmsg) { *(ptr - 1) = '\0'; warn("link property '%s' must be one of: %s", - pip->pi_name, errmsg); + aip->ai_name, errmsg); } else warn("invalid link property '%s'", *val); free(propvals); @@ -4868,16 +5887,16 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) default: if (reset) { warn_dlerr(status, "cannot reset link property " - "'%s' on '%s'", pip->pi_name, argv[optind]); + "'%s' on '%s'", aip->ai_name, argv[optind]); } else { warn_dlerr(status, "cannot set link property " - "'%s' on '%s'", pip->pi_name, argv[optind]); + "'%s' on '%s'", aip->ai_name, argv[optind]); } break; } } done: - free_props(proplist); + dladm_free_props(proplist); if (status != DLADM_STATUS_OK) exit(1); } @@ -5414,7 +6433,7 @@ i_dladm_init_linkprop(datalink_id_t linkid, void *arg) } /*ARGSUSED*/ -static void +void do_init_linkprop(int argc, char **argv, const char *use) { int option; @@ -5890,6 +6909,7 @@ show_ether_xprop(datalink_id_t linkid, void *arg) (void) snprintf(ebuf.eth_ptype, sizeof (ebuf.eth_ptype), "%s", "peeradv"); (void) snprintf(ebuf.eth_state, sizeof (ebuf.eth_state), ""); + (void) dladm_get_single_mac_stat(linkid, "lp_cap_autoneg", KSTAT_DATA_UINT32, &autoneg); (void) snprintf(ebuf.eth_autoneg, sizeof (ebuf.eth_autoneg), diff --git a/usr/src/cmd/dladm/dladm.xcl b/usr/src/cmd/dladm/dladm.xcl index b849b22f79..09192c7f4d 100644 --- a/usr/src/cmd/dladm/dladm.xcl +++ b/usr/src/cmd/dladm/dladm.xcl @@ -21,244 +21,343 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" # -msgid " %-9s\t%s" -msgid " %s" -msgid " Total" -msgid " address-type=%s\n" -msgid " address=%s" -msgid " device=%s address=%s" -msgid " duplex=%s" -msgid " duplex=%s\n" -msgid " lacp-mode=%s" -msgid " lacp-timer=%s\n" -msgid " link=%s" -msgid " policy=%s" -msgid " port=%s" -msgid " speed=%u" + msgid "" -msgid "%%ipkts %%opkts\n" -msgid "%-*s " +msgid "\t%-10llu" +msgid "\t%-6.1f" +msgid "\t-" +msgid "\tipackets rbytes opackets obytes " +msgid "\n" +msgid " " +msgid " " +msgid " %-18s" +msgid " MACADDRESS" +msgid " %-18s" +msgid " MACADDRTYPE" +msgid " dev=%s" +msgid " mac_addr=%s" +msgid " speed=%u" +msgid " vid=%d\n" +msgid "%%ipkts %%opkts\n" msgid "%-*s" msgid "%-10llu" msgid "%-12llu" msgid "%-12llu\n" -msgid "%-14s " -msgid "%-15s " -msgid "%-15s %-14s %-14s %-30s \n" -msgid "%-20s %-20s " -msgid "%-30s " -msgid "%-30s" -msgid "%-8u" -msgid "%-8u\n" -msgid "%s type=%s mtu=%d device=%s\n" -msgid "%s type=%s mtu=%d key=%u\n" -msgid "%s type=legacy mtu=%d device=%s\n" +msgid "%-12s" +msgid "%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n" +msgid "%-12s%-12s" +msgid "%-12s%-12s%10s%-20s%-19s%-6s\n" +msgid "%-12s%8d %-12s%-20s %6d\n" +msgid "%-12s%8s %-12s%-20s %6s\n" +msgid "%-6.1f" +msgid "%-6d\n" +msgid "%-8llu" +msgid "%-8llu\n" +msgid "%5u Mbps" +msgid "%c----" +msgid "%d" +msgid "%d%c-%c" +msgid "%llu" msgid "%s" +msgid "%s\n" +msgid "%s " msgid "%s," msgid "%s: " -msgid "%s=\"%s\" " msgid "%s=\"%s\"" -msgid "%s\n" +msgid "%sfdx" +msgid "%shdx" +msgid "%u" +msgid "%uMb" msgid "," -msgid "--" -msgid "--," -msgid "/dev/%s" +msgid "-R" +msgid "-f" +msgid "-fh" +msgid "-h" +msgid "/" +msgid "/%s" +msgid "/%s/%s" +msgid "/sbin/dladm " msgid "0x" -msgid "0x%-30s" -msgid ": %s (%s)\n" +msgid "100M" +msgid "10M" +msgid "1G" msgid ": %s\n" -msgid ":Lpsi:" +msgid ":L:l:P:R:tu:T:" +msgid ":LpPxsi:o:" +msgid ":R:" msgid ":R:t" msgid ":a" -msgid ":d:R:t" -msgid ":d:l:P:R:tu:T:" +msgid ":d:l:L:P:R:tfu:T:" +msgid ":d:l:R:t" +msgid ":d:l:R:tf" msgid ":e:i:a:m:b:s:k:T:c" msgid ":f:c:R:t" -msgid ":l:P:R:tu:T:" msgid ":o:p" msgid ":p:R:t" -msgid ":p:cP" -msgid ":pPd" -msgid ":psi:" +msgid ":p:cPo:" +msgid ":pPo:" +msgid ":pPo:m" +msgid ":pPsSi:o:" +msgid ":psi:o:" +msgid ":tl:v:p:" msgid "?" 
-msgid "100M" +msgid "ADDRESS" +msgid "ADDRPOLICY" msgid "ADT_dladm_create_secobj" msgid "ADT_dladm_delete_secobj" +msgid "AGGREGATABLE" msgid "AUTH" msgid "AUTO" msgid "BSSID/IBSSID" msgid "BSSTYPE" msgid "CLASS" +msgid "CLIENT" +msgid "COLL" msgid "DEFAULT" +msgid "DEFAULTED" +msgid "DEVICE" +msgid "DIST" msgid "DUPLEX" msgid "ESSID" +msgid "EXPIRED" +msgid "FLAGS" +msgid "IERRORS" +msgid "INUSE" +msgid "IPACKETS" +msgid "IPKTDIST" +msgid "LACPACTIVITY" +msgid "LACPTIMER" msgid "LINK" +msgid "LINK\n" +msgid "LINKID" +msgid "MEDIA" msgid "MODE" +msgid "MTU" msgid "Mb" +msgid "NAME" msgid "OBJECT" -msgid "OBJECT=\"%s\" CLASS=\"%s\" " +msgid "OBYTES" +msgid "OERRORS" +msgid "OPACKETS" +msgid "OPKTDIST" +msgid "OVER" msgid "PAUSE" +msgid "POLICY" +msgid "PORT" +msgid "PORTSTATE" msgid "POSSIBLE" msgid "PROPERTY" -msgid "PROPERTY=\"%s\" " +msgid "PTYPE" +msgid "RBYTES" msgid "REM_FAULT" msgid "SEC" +msgid "SLOT" msgid "SPEED" msgid "SPEED-DUPLEX" +msgid "STATE" msgid "STATUS" msgid "STRENGTH" +msgid "SYNC" +msgid "Total" msgid "VALUE" -msgid "VALUE=\"0x%s\"" -msgid "\n" -msgid "\t %5uMb" -msgid "\t%-10llu" -msgid "\t%-6.1f" -msgid "\t%s" -msgid "\t%s\n" -msgid "\t-" -msgid "\t\t%-10llu" -msgid "\t\tipackets rbytes ierrors " -msgid "\tipackets rbytes opackets obytes " -msgid "active" +msgid "VID" +msgid "a+" +msgid "add-aggr" +msgid "address" +msgid "addrpolicy" +msgid "adt_alloc_event (%s): %s" +msgid "adt_start_session: %s" +msgid "adv" msgid "adv_cap_10" msgid "adv_cap_100" msgid "adv_cap_1000" msgid "adv_cap_asmpause" msgid "adv_cap_autoneg" msgid "adv_cap_pause" -msgid "add-aggr" -msgid "adt_alloc_event (%s): %s" -msgid "adt_start_session: %s" -msgid "aggr key=%d" -msgid "aggr" +msgid "adv_rem_fault" +msgid "aggr%d" +msgid "aggregatable" msgid "all" msgid "all-links" -msgid "attached" msgid "auth" msgid "auto" +msgid "bi" msgid "bssid" msgid "bsstype" -msgid "cap_pause" +msgid "bw-limit" msgid "cap_10" +msgid "cap_100" msgid "cap_1000" +msgid "cap_asmpause" msgid "cap_autoneg" +msgid "cap_pause" +msgid "cap_rem_fault" msgid "capable" +msgid "class" +msgid "client" +msgid "coll" msgid "connect-wifi" +msgid "continuous" +msgid "cpus" msgid "create-aggr" +msgid "create-etherstub" msgid "create-ibss" msgid "create-secobj" msgid "create-vlan" +msgid "create-vnic" msgid "current" +msgid "default" +msgid "defaulted" msgid "delete-aggr" +msgid "delete-etherstub" msgid "delete-phys" msgid "delete-secobj" msgid "delete-vlan" -msgid "dev key=%d" +msgid "delete-vnic" msgid "dev" +msgid "device" msgid "disconnect-wifi" -msgid "down" -msgid "down-aggr" +msgid "dist" +msgid "down-vnic" msgid "duplex" msgid "essid" +msgid "expired" +msgid "extended" msgid "fault" msgid "file" +msgid "fixed" +msgid "fixed (%s)" +msgid "flags" +msgid "forcible" msgid "forever" -msgid "full" -msgid "half" msgid "ibssid" msgid "ierrors" msgid "ifspeed" msgid "init-linkprop" +msgid "init-phys" msgid "init-secobj" msgid "interval" -msgid "invalid input" -msgid "ipackets64" +msgid "inuse" +msgid "ipackets" +msgid "ipktdist" msgid "key" msgid "lacp" msgid "lacp-mode" msgid "lacp-timer" +msgid "lacpactivity" +msgid "lacptimer" msgid "link" msgid "link,class,mtu,state,over" +msgid "link,class,over" msgid "link,device,media,flags" msgid "link,essid,bssid,sec,strength,mode,speed" -msgid "link,essid,bssid,sec,strength,mode,speed,auth,bsstype" +msgid "link,essid,bssid,sec,strength,mode,speed,bsstype" msgid "link,ipackets,rbytes,ierrors,opackets,obytes,oerrors" msgid "link,media,state,speed,duplex,device" -msgid 
"link,property,value,default,possible" msgid "link,policy,addrpolicy,lacpactivity,lacptimer,flags" -msigd "link,port,aggregatable,sync,coll,dist,defaulted,expired" +msgid "link,port,aggregatable,sync,coll,dist,defaulted,expired" msgid "link,port,ipackets,rbytes,opackets,obytes,ipktdist,opktdist" msgid "link,port,speed,duplex,state,address,portstate" +msgid "link,property,value,default,possible" +msgid "link,ptype,state,auto,speed-duplex,pause" +msgid "link,ptype,state,auto,speed-duplex,pause,rem_fault" +msgid "link,slot,address,inuse,client" msgid "link,state,speed,duplex" -msgid "link,vid,over,flags" msgid "link,status,essid,sec,strength,mode,speed" msgid "link,status,essid,sec,strength,mode,speed,auth,bssid,bsstype" +msgid "link,vid,over,flags" +msgid "link=%s" msgid "link_asmpause" msgid "link_autoneg" msgid "link_duplex" msgid "link_pause" msgid "link_state" -msgid "long" msgid "lp_cap_10" msgid "lp_cap_100" msgid "lp_cap_1000" -msgid "lp_cap_autoneg" msgid "lp_cap_asmpause" +msgid "lp_cap_autoneg" msgid "lp_cap_pause" msgid "lp_rem_fault" msgid "mac" +msgid "mac-address" +msgid "mac-prefix" +msgid "media" msgid "mode" msgid "modify-aggr" -msgid "net_rawaccess" +msgid "mtu" msgid "no" -msgid "obytes64" +msgid "none" +msgid "o:px" +msgid "object" +msgid "object,class" +msgid "object,class,value" +msgid "obytes" msgid "oerrors" -msgid "opackets obytes oerrors\n" -msgid "opackets64" +msgid "opackets" +msgid "opktdist" msgid "output" +msgid "over" msgid "parseable" -msgid "passive" msgid "pause" +msgid "pd:si:" msgid "peeradv" msgid "persistent" msgid "policy" +msgid "port" +msgid "portstate" +msgid "possible" +msgid "primary" msgid "prop" +msgid "property" +msgid "ptype" msgid "r" -msgid "rbytes64" +msgid "random" +msgid "rbytes" msgid "rem_fault" msgid "remove-aggr" msgid "rename-link" +msgid "reset" msgid "reset-linkprop" msgid "root-dir" msgid "scan-wifi" msgid "sec" +msgid "set" msgid "set-linkprop" -msgid "short" msgid "show-aggr" msgid "show-dev" +msgid "show-ether" +msgid "show-etherstub" msgid "show-link" +msgid "show-linkmap" msgid "show-linkprop" msgid "show-phys" msgid "show-secobj" -msgid "show-wifi" +msgid "show-usage" msgid "show-vlan" -msgid "show-ether" -msgid "solaris.network.link.security" +msgid "show-vnic" +msgid "show-wifi" +msgid "slot" msgid "speed" msgid "speed-duplex" -msgid "standby" +msgid "state" msgid "statistics" msgid "status" msgid "strength" -msgid "sys_net_config" +msgid "sync" +msgid "tR:" +msgid "tR:d:m:n:p:r:v:" +msgid "tdps:e:f:" msgid "temporary" +msgid "timeout" +msgid "tx" msgid "unicast" msgid "unknown" -msgid "up" msgid "up-aggr" msgid "up-vlan" +msgid "up-vnic" +msgid "value" +msgid "vid" msgid "vlan-id" -msgid "wep" msgid "yes" diff --git a/usr/src/cmd/dladm/vnic.conf b/usr/src/cmd/dladm/vnic.conf new file mode 100644 index 0000000000..d156a65ec1 --- /dev/null +++ b/usr/src/cmd/dladm/vnic.conf @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# +# DO NOT EDIT OR PARSE THIS FILE! +# +# Use the dladm(1m) command to change the contents of this file. + diff --git a/usr/src/cmd/flowadm/Makefile b/usr/src/cmd/flowadm/Makefile new file mode 100644 index 0000000000..b6af8b2b79 --- /dev/null +++ b/usr/src/cmd/flowadm/Makefile @@ -0,0 +1,76 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +PROG=flowadm + +ROOTFS_PROG= $(PROG) + +POFILE= $(PROG).po +CONFIGFILES= flowadm.conf flowprop.conf + +include ../Makefile.cmd + +XGETFLAGS += -a -x $(PROG).xcl +LDLIBS += -L$(ROOT)/lib +LDLIBS += -ldladm -lkstat + +ROOTCFGDIR= $(ROOTETC)/dladm +ROOTCFGFILES= $(CONFIGFILES:%=$(ROOTCFGDIR)/%) + +$(ROOTCFGFILES):= FILEMODE= 644 +$(ROOTCFGFILES):= OWNER= dladm +$(ROOTCFGFILES):= GROUP= sys + +.KEEP_STATE: + +all: $(ROOTFS_PROG) + +# +# Message catalog +# +_msg: $(POFILE) + +$(POFILE): $(PROG).c + $(RM) $@ + $(COMPILE.cpp) $(PROG).c > $(POFILE).i + $(XGETTEXT) $(XGETFLAGS) $(POFILE).i + sed "/^domain/d" messages.po > $@ + $(RM) messages.po $(POFILE).i + +install: all $(ROOTSBINPROG) $(ROOTCFGDIR) $(ROOTCFGFILES) + $(RM) $(ROOTUSRSBINPROG) + -$(SYMLINK) ../../sbin/$(PROG) $(ROOTUSRSBINPROG) + +clean: + +lint: lint_PROG + +$(ROOTCFGDIR): + $(INS.dir) + +$(ROOTCFGDIR)/%: $(ROOTCFGDIR) % + $(INS.file) + +include ../Makefile.targ diff --git a/usr/src/cmd/flowadm/flowadm.c b/usr/src/cmd/flowadm/flowadm.c new file mode 100644 index 0000000000..f4c3859172 --- /dev/null +++ b/usr/src/cmd/flowadm/flowadm.c @@ -0,0 +1,1963 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stropts.h>
+#include <errno.h>
+#include <kstat.h>
+#include <strings.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <priv.h>
+#include <netdb.h>
+#include <libintl.h>
+#include <libdlflow.h>
+#include <libdllink.h>
+#include <libdlstat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/ethernet.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <stddef.h>
+
+#define CMD_TYPE_ANY 0xffffffff
+#define STR_UNDEF_VAL "--"
+
+
+/*
+ * data structures and routines for printing output.
+ */
+
+typedef struct print_field_s {
+ const char *pf_name;
+ const char *pf_header;
+ uint_t pf_width;
+ union {
+ uint_t _pf_index;
+ size_t _pf_offset;
+ }_pf_un;
+#define pf_index _pf_un._pf_index
+#define pf_offset _pf_un._pf_offset
+ uint_t pf_cmdtype;
+} print_field_t;
+
+typedef struct print_state_s {
+ print_field_t **ps_fields;
+ uint_t ps_nfields;
+ boolean_t ps_lastfield;
+ uint_t ps_overflow;
+} print_state_t;
+
+typedef struct show_usage_state_s {
+ boolean_t us_plot;
+ boolean_t us_parseable;
+ boolean_t us_printheader;
+ boolean_t us_first;
+ print_state_t us_print;
+} show_usage_state_t;
+
+typedef char *(*print_callback_t)(print_field_t *, void *);
+static print_field_t **parse_output_fields(char *, print_field_t *, int,
+ uint_t, uint_t *);
+
+static void print_header(print_state_t *);
+static void print_field(print_state_t *, print_field_t *, const char *,
+ boolean_t);
+
+static void flowadm_print_output(print_state_t *, boolean_t,
+ print_callback_t, void *);
+
+/*
+ * helper function that, when invoked as flowadm_print_field(pf, buf),
+ * prints the string which is offset by pf->pf_offset within buf.
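+ *
+ * A minimal sketch (assuming fbuf is a flow_fields_buf_t already
+ * filled in by print_flow() below, and pf points at the flow_fields[]
+ * entry whose pf_offset is offsetof(flow_fields_buf_t, flow_link)):
+ *
+ *	char *val = flowadm_print_field(pf, &fbuf);
+ *
+ * val then points at fbuf.flow_link, the NUL-terminated link name.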
+ */ +static char *flowadm_print_field(print_field_t *, void *); + +#define MAX_FIELD_LEN 32 + +typedef void cmdfunc_t(int, char **); + +static cmdfunc_t do_add_flow, do_remove_flow, do_init_flow, do_show_flow; +static cmdfunc_t do_show_flowprop, do_set_flowprop, do_reset_flowprop; +static cmdfunc_t do_show_usage; + +static int show_flow(dladm_flow_attr_t *, void *); +static int show_flows_onelink(datalink_id_t, void *); + +static void flow_stats(const char *, datalink_id_t, uint_t); +static void get_flow_stats(const char *, pktsum_t *); +static int show_flow_stats(dladm_flow_attr_t *, void *); +static int show_link_flow_stats(datalink_id_t, void *); + +static int remove_flow(dladm_flow_attr_t *, void *); + +static int show_flowprop(dladm_flow_attr_t *, void *); +static void show_flowprop_one_flow(void *, const char *); +static int show_flowprop_onelink(datalink_id_t, void *); + +static void die(const char *, ...); +static void die_optdup(int); +static void die_opterr(int, int); +static void die_dlerr(dladm_status_t, const char *, ...); +static void warn(const char *, ...); +static void warn_dlerr(dladm_status_t, const char *, ...); + +typedef struct cmd { + char *c_name; + void (*c_fn)(int, char **); +} cmd_t; + +static cmd_t cmds[] = { + { "add-flow", do_add_flow }, + { "remove-flow", do_remove_flow }, + { "show-flowprop", do_show_flowprop }, + { "set-flowprop", do_set_flowprop }, + { "reset-flowprop", do_reset_flowprop }, + { "show-flow", do_show_flow }, + { "init-flow", do_init_flow }, + { "show-usage", do_show_usage } +}; + +static const struct option longopts[] = { + {"link", required_argument, 0, 'l'}, + {"parseable", no_argument, 0, 'p'}, + {"statistics", no_argument, 0, 's'}, + {"interval", required_argument, 0, 'i'}, + {"temporary", no_argument, 0, 't'}, + {"root-dir", required_argument, 0, 'R'}, + { 0, 0, 0, 0 } +}; + +static const struct option prop_longopts[] = { + {"link", required_argument, 0, 'l'}, + {"temporary", no_argument, 0, 't'}, + {"root-dir", required_argument, 0, 'R'}, + {"prop", required_argument, 0, 'p'}, + {"attr", required_argument, 0, 'a'}, + { 0, 0, 0, 0 } +}; + +/* + * structures for 'flowadm show-flow' + */ + +typedef struct show_flow_state { + boolean_t fs_firstonly; + boolean_t fs_donefirst; + pktsum_t fs_prevstats; + uint32_t fs_flags; + dladm_status_t fs_status; + print_state_t fs_print; + const char *fs_flow; + const char *fs_link; + boolean_t fs_parseable; + boolean_t fs_printheader; + boolean_t fs_persist; + boolean_t fs_stats; + uint64_t fs_mask; +} show_flow_state_t; + +/* + * structures for 'flowadm remove-flow' + */ + +typedef struct remove_flow_state { + boolean_t fs_tempop; + const char *fs_altroot; + dladm_status_t fs_status; +} remove_flow_state_t; + +typedef struct flow_args_s { + const char *fa_link; + int fa_attrno; /* -1 indicates flow itself */ + uint64_t fa_mask; + dladm_flow_attr_t *fa_finfop; + dladm_status_t *fa_status; + boolean_t fa_parseable; +} flow_args_t; + +#define PROTO_MAXSTR_LEN 7 +#define PORT_MAXSTR_LEN 6 +#define DSFIELD_MAXSTR_LEN 10 + +typedef struct flow_fields_buf_s +{ + char flow_name[MAXNAMELEN]; + char flow_link[MAXLINKNAMELEN]; + char flow_ipaddr[INET6_ADDRSTRLEN+4]; + char flow_proto[PROTO_MAXSTR_LEN]; + char flow_port[PORT_MAXSTR_LEN]; + char flow_dsfield[DSFIELD_MAXSTR_LEN]; +} flow_fields_buf_t; + +static print_field_t flow_fields[] = { +/* name, header, field width, index, cmdtype */ +{ "flow", "FLOW", 11, + offsetof(flow_fields_buf_t, flow_name), CMD_TYPE_ANY}, +{ "link", "LINK", 11, + 
offsetof(flow_fields_buf_t, flow_link), CMD_TYPE_ANY}, +{ "ipaddr", "IP ADDR", 30, + offsetof(flow_fields_buf_t, flow_ipaddr), CMD_TYPE_ANY}, +{ "transport", "PROTO", 6, + offsetof(flow_fields_buf_t, flow_proto), CMD_TYPE_ANY}, +{ "port", "PORT", 7, + offsetof(flow_fields_buf_t, flow_port), CMD_TYPE_ANY}, +{ "dsfield", "DSFLD", 9, + offsetof(flow_fields_buf_t, flow_dsfield), CMD_TYPE_ANY}} +; + +#define FLOW_MAX_FIELDS (sizeof (flow_fields) / sizeof (print_field_t)) + +/* + * structures for 'flowadm show-flowprop' + */ +typedef enum { + FLOWPROP_FLOW, + FLOWPROP_PROPERTY, + FLOWPROP_VALUE, + FLOWPROP_DEFAULT, + FLOWPROP_POSSIBLE +} flowprop_field_index_t; + +static print_field_t flowprop_fields[] = { +/* name, header, fieldwidth, index, cmdtype */ +{ "flow", "FLOW", 12, FLOWPROP_FLOW, CMD_TYPE_ANY}, +{ "property", "PROPERTY", 15, FLOWPROP_PROPERTY, CMD_TYPE_ANY}, +{ "value", "VALUE", 14, FLOWPROP_VALUE, CMD_TYPE_ANY}, +{ "default", "DEFAULT", 14, FLOWPROP_DEFAULT, CMD_TYPE_ANY}, +{ "possible", "POSSIBLE", 20, FLOWPROP_POSSIBLE, CMD_TYPE_ANY}} +; +#define FLOWPROP_MAX_FIELDS \ + (sizeof (flowprop_fields) / sizeof (print_field_t)) + +#define MAX_PROP_LINE 512 + +typedef struct show_flowprop_state { + const char *fs_flow; + datalink_id_t fs_linkid; + char *fs_line; + char **fs_propvals; + dladm_arg_list_t *fs_proplist; + boolean_t fs_parseable; + boolean_t fs_persist; + boolean_t fs_header; + dladm_status_t fs_status; + dladm_status_t fs_retstatus; + print_state_t fs_print; +} show_flowprop_state_t; + +typedef struct set_flowprop_state { + const char *fs_name; + boolean_t fs_reset; + boolean_t fs_temp; + dladm_status_t fs_status; +} set_flowprop_state_t; + +typedef struct flowprop_args_s { + show_flowprop_state_t *fs_state; + char *fs_propname; + char *fs_flowname; +} flowprop_args_t; + +/* + * structures for 'flow show-usage' + */ + +typedef struct usage_fields_buf_s { + char usage_flow[12]; + char usage_duration[10]; + char usage_ipackets[9]; + char usage_rbytes[10]; + char usage_opackets[9]; + char usage_obytes[10]; + char usage_bandwidth[14]; +} usage_fields_buf_t; + +static print_field_t usage_fields[] = { +/* name, header, field width, offset, cmdtype */ +{ "flow", "FLOW", 12, + offsetof(usage_fields_buf_t, usage_flow), CMD_TYPE_ANY}, +{ "duration", "DURATION", 10, + offsetof(usage_fields_buf_t, usage_duration), CMD_TYPE_ANY}, +{ "ipackets", "IPACKETS", 9, + offsetof(usage_fields_buf_t, usage_ipackets), CMD_TYPE_ANY}, +{ "rbytes", "RBYTES", 10, + offsetof(usage_fields_buf_t, usage_rbytes), CMD_TYPE_ANY}, +{ "opackets", "OPACKETS", 9, + offsetof(usage_fields_buf_t, usage_opackets), CMD_TYPE_ANY}, +{ "obytes", "OBYTES", 10, + offsetof(usage_fields_buf_t, usage_obytes), CMD_TYPE_ANY}, +{ "bandwidth", "BANDWIDTH", 14, + offsetof(usage_fields_buf_t, usage_bandwidth), CMD_TYPE_ANY}} +; + +#define USAGE_MAX_FIELDS (sizeof (usage_fields) / sizeof (print_field_t)) + +/* + * structures for 'dladm show-usage link' + */ + +typedef struct usage_l_fields_buf_s { + char usage_l_flow[12]; + char usage_l_stime[13]; + char usage_l_etime[13]; + char usage_l_rbytes[8]; + char usage_l_obytes[8]; + char usage_l_bandwidth[14]; +} usage_l_fields_buf_t; + +static print_field_t usage_l_fields[] = { +/* name, header, field width, offset, cmdtype */ +{ "flow", "FLOW", 12, + offsetof(usage_l_fields_buf_t, usage_l_flow), CMD_TYPE_ANY}, +{ "start", "START", 13, + offsetof(usage_l_fields_buf_t, usage_l_stime), CMD_TYPE_ANY}, +{ "end", "END", 13, + offsetof(usage_l_fields_buf_t, usage_l_etime), CMD_TYPE_ANY}, +{ 
"rbytes", "RBYTES", 8, + offsetof(usage_l_fields_buf_t, usage_l_rbytes), CMD_TYPE_ANY}, +{ "obytes", "OBYTES", 8, + offsetof(usage_l_fields_buf_t, usage_l_obytes), CMD_TYPE_ANY}, +{ "bandwidth", "BANDWIDTH", 14, + offsetof(usage_l_fields_buf_t, usage_l_bandwidth), CMD_TYPE_ANY}} +; + +#define USAGE_L_MAX_FIELDS \ + (sizeof (usage_l_fields) /sizeof (print_field_t)) + +#define PRI_HI 100 +#define PRI_LO 10 +#define PRI_NORM 50 + +#define FLOWADM_CONF "/etc/dladm/flowadm.conf" +#define BLANK_LINE(s) ((s[0] == '\0') || (s[0] == '#') || (s[0] == '\n')) + +static char *progname; + +boolean_t t_arg = B_FALSE; /* changes are persistent */ +char *altroot = NULL; + +static const char *attr_table[] = + {"local_ip", "remote_ip", "transport", "local_port", "dsfield"}; + +#define NATTR (sizeof (attr_table)/sizeof (char *)) + +static void +usage(void) +{ + (void) fprintf(stderr, gettext("usage: flowadm <subcommand>" + " <args>...\n" + "\tadd-flow [-t] [-R <root-dir>] -l <link>\n" + "\t\t-a attr=value[,...] [-p prop=value,...]\n" + "\t\tflow-name\n" + "\tremove-flow [-t] [-R <root-dir>] {-l <link> | flow-name}\n" + "\tset-flowprop [-t] [-R <root-dir>] \n" + "\t\t-p prop=value[,...] flowname\n" + "\treset-flowprop [-t] [-R <root-dir>] \n" + "\t\t[-p prop,...] flowname\n" + "\tshow-flowprop [-cP] [-l <link>] [-p prop,...] [flow-name]\n" + "\tshow-flow [-p] [-s [-i <interval>]] [-l <link>] [flow-name]\n" + "\tshow-usage [-d|-p -F <format>] [-s <DD/MM/YYYY,HH:MM:SS>]\n" + "\t\t[-e <DD/MM/YYYY,HH:MM:SS>]] -f <logfile> [<name>]\n")); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int i, arglen, cmdlen; + cmd_t *cmdp; + + (void) setlocale(LC_ALL, ""); +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + (void) textdomain(TEXT_DOMAIN); + + progname = argv[0]; + + if (argc < 2) + usage(); + + for (i = 0; i < sizeof (cmds) / sizeof (cmds[0]); i++) { + cmdp = &cmds[i]; + arglen = strlen(argv[1]); + cmdlen = strlen(cmdp->c_name); + if ((arglen == cmdlen) && (strncmp(argv[1], cmdp->c_name, + cmdlen) == 0)) { + cmdp->c_fn(argc - 1, &argv[1]); + exit(0); + } + } + + (void) fprintf(stderr, gettext("%s: unknown subcommand '%s'\n"), + progname, argv[1]); + usage(); + + return (0); +} + +static const char * +match_attr(char *attr) +{ + int i; + + for (i = 0; i < NATTR; i++) { + if (strlen(attr) == strlen(attr_table[i]) && + strncmp(attr, attr_table[i], strlen(attr_table[i])) == 0) { + return (attr); + } + } + return (NULL); +} + +/* ARGSUSED */ +static void +do_init_flow(int argc, char *argv[]) +{ + dladm_status_t status; + + status = dladm_flow_init(); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "flows initialization failed"); +} + +/* ARGSUSED */ +static int +show_usage_date(dladm_usage_t *usage, void *arg) +{ + + time_t stime; + char timebuf[20]; + + stime = usage->du_stime; + (void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y", + localtime(&stime)); + (void) printf("%s\n", timebuf); + + return (DLADM_STATUS_OK); +} + +static int +show_usage_time(dladm_usage_t *usage, void *arg) +{ + show_usage_state_t *state = (show_usage_state_t *)arg; + char buf[DLADM_STRSIZE]; + usage_l_fields_buf_t ubuf; + time_t time; + double bw; + + if (state->us_plot) { + if (!state->us_printheader) { + if (state->us_first) { + (void) printf("# Time"); + state->us_first = B_FALSE; + } + (void) printf(" %s", usage->du_name); + if (usage->du_last) { + (void) printf("\n"); + state->us_first = B_TRUE; + state->us_printheader = B_TRUE; + } + } else { + if (state->us_first) { + time = usage->du_etime; + (void) 
strftime(buf, sizeof (buf), "%T", + localtime(&time)); + state->us_first = B_FALSE; + (void) printf("%s", buf); + } + bw = (double)usage->du_bandwidth/1000; + (void) printf(" %.2f", bw); + if (usage->du_last) { + (void) printf("\n"); + state->us_first = B_TRUE; + } + } + return (DLADM_STATUS_OK); + } + + bzero(&ubuf, sizeof (ubuf)); + + (void) snprintf(ubuf.usage_l_flow, sizeof (ubuf.usage_l_flow), "%s", + usage->du_name); + time = usage->du_stime; + (void) strftime(buf, sizeof (buf), "%T", localtime(&time)); + (void) snprintf(ubuf.usage_l_stime, sizeof (ubuf.usage_l_stime), "%s", + buf); + time = usage->du_etime; + (void) strftime(buf, sizeof (buf), "%T", localtime(&time)); + (void) snprintf(ubuf.usage_l_etime, sizeof (ubuf.usage_l_etime), "%s", + buf); + (void) snprintf(ubuf.usage_l_rbytes, sizeof (ubuf.usage_l_rbytes), + "%llu", usage->du_rbytes); + (void) snprintf(ubuf.usage_l_obytes, sizeof (ubuf.usage_l_obytes), + "%llu", usage->du_obytes); + (void) snprintf(ubuf.usage_l_bandwidth, sizeof (ubuf.usage_l_bandwidth), + "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf)); + + if (!state->us_parseable && !state->us_printheader) { + print_header(&state->us_print); + state->us_printheader = B_TRUE; + } + + flowadm_print_output(&state->us_print, state->us_parseable, + flowadm_print_field, (void *)&ubuf); + + return (DLADM_STATUS_OK); +} + +static int +show_usage_res(dladm_usage_t *usage, void *arg) +{ + show_usage_state_t *state = (show_usage_state_t *)arg; + char buf[DLADM_STRSIZE]; + usage_fields_buf_t ubuf; + + bzero(&ubuf, sizeof (ubuf)); + + (void) snprintf(ubuf.usage_flow, sizeof (ubuf.usage_flow), "%s", + usage->du_name); + (void) snprintf(ubuf.usage_duration, sizeof (ubuf.usage_duration), + "%llu", usage->du_duration); + (void) snprintf(ubuf.usage_ipackets, sizeof (ubuf.usage_ipackets), + "%llu", usage->du_ipackets); + (void) snprintf(ubuf.usage_rbytes, sizeof (ubuf.usage_rbytes), + "%llu", usage->du_rbytes); + (void) snprintf(ubuf.usage_opackets, sizeof (ubuf.usage_opackets), + "%llu", usage->du_opackets); + (void) snprintf(ubuf.usage_obytes, sizeof (ubuf.usage_obytes), + "%llu", usage->du_obytes); + (void) snprintf(ubuf.usage_bandwidth, sizeof (ubuf.usage_bandwidth), + "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf)); + + if (!state->us_parseable && !state->us_printheader) { + print_header(&state->us_print); + state->us_printheader = B_TRUE; + } + + flowadm_print_output(&state->us_print, state->us_parseable, + flowadm_print_field, (void *)&ubuf); + + return (DLADM_STATUS_OK); +} + +static boolean_t +valid_formatspec(char *formatspec_str) +{ + if (strcmp(formatspec_str, "gnuplot") == 0) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static void +do_show_usage(int argc, char *argv[]) +{ + char *file = NULL; + int opt; + dladm_status_t status; + boolean_t d_arg = B_FALSE; + boolean_t p_arg = B_FALSE; + char *stime = NULL; + char *etime = NULL; + char *resource = NULL; + show_usage_state_t state; + boolean_t o_arg = B_FALSE; + boolean_t F_arg = B_FALSE; + char *fields_str = NULL; + char *formatspec_str = NULL; + print_field_t **fields; + uint_t nfields; + char *all_fields = + "flow,duration,ipackets,rbytes,opackets,obytes,bandwidth"; + char *all_l_fields = + "flow,start,end,rbytes,obytes,bandwidth"; + + bzero(&state, sizeof (show_usage_state_t)); + state.us_parseable = B_FALSE; + state.us_printheader = B_FALSE; + state.us_plot = B_FALSE; + state.us_first = B_TRUE; + + while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) { + switch (opt) { + case 'd': + d_arg = 
B_TRUE;
+ break;
+ case 'p':
+ state.us_plot = p_arg = B_TRUE;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 's':
+ stime = optarg;
+ break;
+ case 'e':
+ etime = optarg;
+ break;
+ case 'o':
+ o_arg = B_TRUE;
+ fields_str = optarg;
+ break;
+ case 'F':
+ F_arg = B_TRUE;
+ formatspec_str = optarg;
+ break;
+ default:
+ die_opterr(optopt, opt);
+ }
+ }
+
+ if (file == NULL)
+ die("show-usage requires a file");
+
+ if (optind == (argc-1)) {
+ resource = argv[optind];
+ }
+
+ if (resource == NULL && stime == NULL && etime == NULL) {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_fields;
+ fields = parse_output_fields(fields_str, usage_fields,
+ USAGE_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ } else {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_l_fields;
+ fields = parse_output_fields(fields_str, usage_l_fields,
+ USAGE_L_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ }
+
+ if (fields == NULL) {
+ die("invalid field(s) specified");
+ return;
+ }
+ state.us_print.ps_fields = fields;
+ state.us_print.ps_nfields = nfields;
+
+ if (p_arg && d_arg)
+ die("plot and date options are incompatible");
+
+ if (p_arg && !F_arg)
+ die("specify format specifier: -F <format>");
+
+ if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
+ die("format specifier %s not supported", formatspec_str);
+
+ if (d_arg) {
+ /* Print log dates */
+ status = dladm_usage_dates(show_usage_date,
+ DLADM_LOGTYPE_FLOW, file, resource, &state);
+ } else if (resource == NULL && stime == NULL && etime == NULL &&
+ !p_arg) {
+ /* Print summary */
+ status = dladm_usage_summary(show_usage_res,
+ DLADM_LOGTYPE_FLOW, file, &state);
+ } else if (resource != NULL) {
+ /* Print log entries for named resource */
+ status = dladm_walk_usage_res(show_usage_time,
+ DLADM_LOGTYPE_FLOW, file, resource, stime, etime, &state);
+ } else {
+ /* Print time and information for each link */
+ status = dladm_walk_usage_time(show_usage_time,
+ DLADM_LOGTYPE_FLOW, file, stime, etime, &state);
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "show-usage");
+}
+
+static void
+do_add_flow(int argc, char *argv[])
+{
+ char devname[MAXNAMELEN];
+ char *name = NULL;
+ uint_t index;
+ datalink_id_t linkid;
+
+ char option;
+ boolean_t l_arg = B_FALSE;
+ dladm_arg_list_t *proplist = NULL;
+ dladm_arg_list_t *attrlist = NULL;
+ dladm_status_t status;
+
+ while ((option = getopt_long(argc, argv, "tR:l:a:p:",
+ prop_longopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ t_arg = B_TRUE;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(devname, optarg,
+ MAXNAMELEN) >= MAXNAMELEN) {
+ die("link name too long");
+ }
+ if (dladm_name2info(devname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK)
+ die("invalid link '%s'", devname);
+ l_arg = B_TRUE;
+ break;
+ case 'a':
+ if (dladm_parse_flow_attrs(optarg, &attrlist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid flow attribute specified");
+ break;
+ case 'p':
+ if (dladm_parse_flow_props(optarg, &proplist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid flow property specified");
+ break;
+ default:
+ die_opterr(optopt, option);
+ }
+ }
+ if (!l_arg) {
+ die("link is required");
+ }
+
+ opterr = 0;
+ index = optind;
+
+ if ((index != (argc - 1)) || match_attr(argv[index]) != NULL) {
+ die("flow name is required");
+ } else {
+ /* get flow name; required last argument */
+ if (strlen(argv[index]) >= MAXFLOWNAME)
+ die("flow name too long");
+ name = argv[index];
+ }
+
+ status = dladm_flow_add(linkid,
attrlist, proplist, name,
+ t_arg, altroot);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "add flow failed");
+
+ dladm_free_attrs(attrlist);
+ dladm_free_props(proplist);
+}
+
+static void
+do_remove_flow(int argc, char *argv[])
+{
+ char option;
+ char *flowname = NULL;
+ char linkname[MAXNAMELEN];
+ datalink_id_t linkid = DATALINK_ALL_LINKID;
+ boolean_t l_arg = B_FALSE;
+ remove_flow_state_t state;
+ dladm_status_t status;
+
+ bzero(&state, sizeof (state));
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":tR:l:",
+ longopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ t_arg = B_TRUE;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(linkname, optarg,
+ MAXLINKNAMELEN) >= MAXLINKNAMELEN) {
+ die("link name too long");
+ }
+ if (dladm_name2info(linkname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK) {
+ die("invalid link '%s'", linkname);
+ }
+ l_arg = B_TRUE;
+ break;
+ default:
+ die_opterr(optopt, option);
+ break;
+ }
+ }
+
+ /* when link not specified get flow name */
+ if (!l_arg) {
+ if (optind != (argc-1)) {
+ usage();
+ } else {
+ if (strlen(argv[optind]) >= MAXFLOWNAME)
+ die("flow name too long");
+ flowname = argv[optind];
+ }
+ status = dladm_flow_remove(flowname, t_arg, altroot);
+ } else {
+ /* if link is specified then flow name should not be there */
+ if (optind == argc-1)
+ usage();
+ /* walk the link to find flows and remove them */
+ state.fs_tempop = t_arg;
+ state.fs_altroot = altroot;
+ state.fs_status = DLADM_STATUS_OK;
+ status = dladm_walk_flow(remove_flow, linkid, &state, B_FALSE);
+ /*
+ * check if dladm_walk_flow terminated early and see if the
+ * walker function has any status for us
+ */
+ if (status == DLADM_STATUS_OK)
+ status = state.fs_status;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "remove flow failed");
+}
+
+/*
+ * Walker function for removing a flow through dladm_walk_flow().
+ */
+static int
+remove_flow(dladm_flow_attr_t *attr, void *arg)
+{
+ remove_flow_state_t *state = (remove_flow_state_t *)arg;
+
+ state->fs_status = dladm_flow_remove(attr->fa_flowname,
+ state->fs_tempop, state->fs_altroot);
+
+ if (state->fs_status == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+ else
+ return (DLADM_WALK_TERMINATE);
+}
+
+static char *
+flowadm_print_field(print_field_t *pf, void *arg)
+{
+ char *value;
+
+ value = (char *)arg + pf->pf_offset;
+ return (value);
+}
+
+/*ARGSUSED*/
+static dladm_status_t
+print_flow(show_flow_state_t *state, dladm_flow_attr_t *attr,
+ flow_fields_buf_t *fbuf)
+{
+ char link[MAXLINKNAMELEN];
+ dladm_status_t status;
+
+ if ((status = dladm_datalink_id2info(attr->fa_linkid, NULL, NULL,
+ NULL, link, sizeof (link))) != DLADM_STATUS_OK) {
+ return (status);
+ }
+
+ (void) snprintf(fbuf->flow_name, sizeof (fbuf->flow_name),
+ "%s", attr->fa_flowname);
+ (void) snprintf(fbuf->flow_link, sizeof (fbuf->flow_link),
+ "%s", link);
+
+ (void) dladm_flow_attr_ip2str(attr, fbuf->flow_ipaddr,
+ sizeof (fbuf->flow_ipaddr));
+ (void) dladm_flow_attr_proto2str(attr, fbuf->flow_proto,
+ sizeof (fbuf->flow_proto));
+ (void) dladm_flow_attr_port2str(attr, fbuf->flow_port,
+ sizeof (fbuf->flow_port));
+ (void) dladm_flow_attr_dsfield2str(attr, fbuf->flow_dsfield,
+ sizeof (fbuf->flow_dsfield));
+
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walker function for showing flow attributes through dladm_walk_flow().
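+ *
+ * A minimal sketch of the call (it mirrors show_flows_onelink() below;
+ * state is a show_flow_state_t set up by do_show_flow()):
+ *
+ *	(void) dladm_walk_flow(show_flow, linkid, &state, B_FALSE);
+ *
+ * dladm_walk_flow() then calls show_flow() once for every flow on
+ * linkid; returning DLADM_WALK_CONTINUE keeps the walk going.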
+ */ +static int +show_flow(dladm_flow_attr_t *attr, void *arg) +{ + show_flow_state_t *statep = arg; + dladm_status_t status; + flow_fields_buf_t fbuf; + + /* + * first get all the flow attributes into fbuf; + */ + bzero(&fbuf, sizeof (fbuf)); + status = print_flow(statep, attr, &fbuf); + + if (status != DLADM_STATUS_OK) + goto done; + + if (!statep->fs_parseable && !statep->fs_printheader) { + print_header(&statep->fs_print); + statep->fs_printheader = B_TRUE; + } + + flowadm_print_output(&statep->fs_print, statep->fs_parseable, + flowadm_print_field, (void *)&fbuf); + +done: + statep->fs_status = status; + return (DLADM_WALK_CONTINUE); +} + +static void +show_one_flow(void *arg, const char *name) +{ + dladm_flow_attr_t attr; + dladm_status_t status; + + if (dladm_flow_info(name, &attr) != DLADM_STATUS_OK) + die("invalid flow: '%s'", name); + else + show_flow(&attr, arg); +} + +/* + * Wrapper of dladm_walk_flow(show_flow,...) to make it usable to + * dladm_walk_datalink_id(). Used for showing flow attributes for + * all flows on all links. + */ +static int +show_flows_onelink(datalink_id_t linkid, void *arg) +{ + show_flow_state_t *state = arg; + + (void) dladm_walk_flow(show_flow, linkid, arg, state->fs_persist); + + return (DLADM_WALK_CONTINUE); +} + +static void +get_flow_stats(const char *flowname, pktsum_t *stats) +{ + kstat_ctl_t *kcp; + kstat_t *ksp; + + bzero(stats, sizeof (*stats)); + + if ((kcp = kstat_open()) == NULL) { + warn("kstat open operation failed"); + return; + } + + ksp = dladm_kstat_lookup(kcp, NULL, -1, flowname, "flow"); + + if (ksp != NULL) + dladm_get_stats(kcp, ksp, stats); + + (void) kstat_close(kcp); +} + +/* ARGSUSED */ +static int +show_flow_stats(dladm_flow_attr_t *attr, void *arg) +{ + show_flow_state_t *state = (show_flow_state_t *)arg; + const char *name = attr->fa_flowname; + pktsum_t stats, diff_stats; + + if (state->fs_firstonly) { + if (state->fs_donefirst) + return (DLADM_WALK_TERMINATE); + state->fs_donefirst = B_TRUE; + } else { + bzero(&state->fs_prevstats, sizeof (state->fs_prevstats)); + } + + get_flow_stats(name, &stats); + dladm_stats_diff(&diff_stats, &stats, &state->fs_prevstats); + + (void) printf("%-12s", name); + (void) printf("%-10llu", diff_stats.ipackets); + (void) printf("%-12llu", diff_stats.rbytes); + (void) printf("%-8llu", diff_stats.ierrors); + (void) printf("%-10llu", diff_stats.opackets); + (void) printf("%-12llu", diff_stats.obytes); + (void) printf("%-8llu\n", diff_stats.oerrors); + + state->fs_prevstats = stats; + + return (DLADM_WALK_CONTINUE); +} + +/* + * Wrapper of dladm_walk_flow(show_flow,...) to make it usable for + * dladm_walk_datalink_id(). Used for showing flow stats for + * all flows on all links. + */ +static int +show_link_flow_stats(datalink_id_t linkid, void * arg) +{ + if (dladm_walk_flow(show_flow_stats, linkid, arg, B_FALSE) + == DLADM_STATUS_OK) + return (DLADM_WALK_CONTINUE); + else + return (DLADM_WALK_TERMINATE); +} + +/* ARGSUSED */ +static void +flow_stats(const char *flow, datalink_id_t linkid, uint_t interval) +{ + show_flow_state_t state; + dladm_flow_attr_t attr; + + if (flow != NULL && dladm_flow_info(flow, &attr) != DLADM_STATUS_OK) + die("invalid flow %s", flow); + + bzero(&state, sizeof (state)); + + /* + * If an interval is specified, continuously show the stats + * for only the first flow. 
+ */ + state.fs_firstonly = (interval != 0); + + for (;;) { + if (!state.fs_donefirst) + (void) printf("%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n", + "FLOW", "IPACKETS", "RBYTES", "IERRORS", + "OPACKETS", "OBYTES", "OERRORS"); + + state.fs_donefirst = B_FALSE; + + /* Show stats for named flow */ + if (flow != NULL) { + state.fs_flow = flow; + (void) show_flow_stats(&attr, &state); + + /* Show all stats on a link */ + } else if (linkid != DATALINK_INVALID_LINKID) { + (void) dladm_walk_flow(show_flow_stats, linkid, &state, + B_FALSE); + + /* Show all stats by datalink */ + } else { + (void) dladm_walk_datalink_id(show_link_flow_stats, + &state, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } + + if (interval == 0) + break; + + (void) sleep(interval); + } +} + +static void +do_show_flow(int argc, char *argv[]) +{ + char flowname[MAXFLOWNAME]; + char linkname[MAXNAMELEN]; + datalink_id_t linkid = DATALINK_ALL_LINKID; + int option; + boolean_t s_arg = B_FALSE; + boolean_t S_arg = B_FALSE; + boolean_t i_arg = B_FALSE; + boolean_t l_arg = B_FALSE; + boolean_t o_arg = B_FALSE; + uint32_t interval = 0; + char *endp = NULL; + show_flow_state_t state; + char *fields_str = NULL; + print_field_t **fields; + uint_t nfields; + char *all_fields = + "flow,link,ipaddr,transport,port,dsfield"; + dladm_status_t status; + + bzero(&state, sizeof (state)); + + opterr = 0; + while ((option = getopt_long(argc, argv, ":pPsSi:l:o:", + longopts, NULL)) != -1) { + switch (option) { + case 'p': + state.fs_parseable = B_TRUE; + break; + case 'P': + state.fs_persist = B_TRUE; + break; + case 's': + if (s_arg) + die_optdup(option); + + s_arg = B_TRUE; + break; + case 'S': + if (S_arg) + die_optdup(option); + + S_arg = B_TRUE; + break; + case 'o': + if (o_arg) + die_optdup(option); + + o_arg = B_TRUE; + fields_str = optarg; + break; + case 'i': + if (i_arg) + die_optdup(option); + + i_arg = B_TRUE; + + errno = 0; + interval = (int)strtol(optarg, &endp, 10); + if (errno != 0 || interval == 0 || *endp != '\0') + die("invalid interval value" " '%d'\n", + interval); + break; + case 'l': + if (strlcpy(linkname, optarg, MAXLINKNAMELEN) + >= MAXLINKNAMELEN) + die("link name too long\n"); + if (dladm_name2info(linkname, &linkid, NULL, + NULL, NULL) != DLADM_STATUS_OK) + die("invalid link '%s'", linkname); + l_arg = B_TRUE; + break; + default: + die_opterr(optopt, option); + break; + } + } + if (i_arg && !(s_arg || S_arg)) + die("the -i option can be used only with -s or -S"); + + if (s_arg && S_arg) + die("the -s option cannot be used with -S"); + + /* get flow name (optional last argument */ + if (optind == (argc-1)) { + if (strlcpy(flowname, argv[optind], MAXFLOWNAME) + >= MAXFLOWNAME) + die("flow name too long"); + state.fs_flow = flowname; + } + + if (s_arg) { + flow_stats(state.fs_flow, linkid, interval); + return; + } + + if (S_arg) { + dladm_continuous(linkid, state.fs_flow, interval, FLOW_REPORT); + return; + } + + if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) + fields_str = all_fields; + + fields = parse_output_fields(fields_str, flow_fields, FLOW_MAX_FIELDS, + CMD_TYPE_ANY, &nfields); + + if (fields == NULL) { + die("invalid fields(s) specified"); + return; + } + + state.fs_print.ps_fields = fields; + state.fs_print.ps_nfields = nfields; + + /* Show attributes of one flow */ + if (state.fs_flow != NULL) { + show_one_flow(&state, state.fs_flow); + + /* Show attributes of flows on one link */ + } else if (l_arg) { + (void) show_flows_onelink(linkid, &state); + + /* Show attributes of all flows 
on all links */ + } else { + (void) dladm_walk_datalink_id(show_flows_onelink, &state, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } +} + +static dladm_status_t +set_flowprop_persist(const char *flow, const char *prop_name, char **prop_val, + uint_t val_cnt, boolean_t reset) +{ + dladm_status_t status; + char *errprop; + + status = dladm_set_flowprop(flow, prop_name, prop_val, val_cnt, + DLADM_OPT_PERSIST, &errprop); + + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "cannot persistently %s flow " + "property '%s' on '%s'", reset? "reset": "set", + errprop, flow); + } + return (status); +} + +static void +set_flowprop(int argc, char **argv, boolean_t reset) +{ + int i, option; + char errmsg[DLADM_STRSIZE]; + const char *flow = NULL; + dladm_arg_list_t *proplist = NULL; + boolean_t temp = B_FALSE; + dladm_status_t status = DLADM_STATUS_OK; + + opterr = 0; + while ((option = getopt_long(argc, argv, ":p:R:t", + prop_longopts, NULL)) != -1) { + switch (option) { + case 'p': + if (dladm_parse_flow_props(optarg, &proplist, reset) + != DLADM_STATUS_OK) + die("invalid flow property specified"); + break; + case 't': + temp = B_TRUE; + break; + case 'R': + status = dladm_set_rootdir(optarg); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "invalid directory " + "specified"); + } + break; + default: + die_opterr(optopt, option); + break; + } + } + + if (optind == (argc - 1)) { + if (strlen(argv[optind]) >= MAXFLOWNAME) + die("flow name too long"); + flow = argv[optind]; + } else if (optind != argc) { + usage(); + } + if (flow == NULL) + die("flow name must be specified"); + + if (proplist == NULL) { + char *errprop; + + if (!reset) + die("flow property must be specified"); + + status = dladm_set_flowprop(flow, NULL, NULL, 0, + DLADM_OPT_ACTIVE, &errprop); + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "cannot reset flow property '%s' " + "on '%s'", errprop, flow); + } + if (!temp) { + dladm_status_t s; + + s = set_flowprop_persist(flow, NULL, NULL, 0, reset); + if (s != DLADM_STATUS_OK) + status = s; + } + goto done; + } + + for (i = 0; i < proplist->al_count; i++) { + dladm_arg_info_t *aip = &proplist->al_info[i]; + char **val; + uint_t count; + dladm_status_t s; + + if (reset) { + val = NULL; + count = 0; + } else { + val = aip->ai_val; + count = aip->ai_count; + if (count == 0) { + warn("no value specified for '%s'", + aip->ai_name); + status = DLADM_STATUS_BADARG; + continue; + } + } + s = dladm_set_flowprop(flow, aip->ai_name, val, count, + DLADM_OPT_ACTIVE, NULL); + if (s == DLADM_STATUS_OK) { + if (!temp) { + s = set_flowprop_persist(flow, + aip->ai_name, val, count, reset); + if (s != DLADM_STATUS_OK) + status = s; + } + continue; + } + status = s; + switch (s) { + case DLADM_STATUS_NOTFOUND: + warn("invalid flow property '%s'", aip->ai_name); + break; + case DLADM_STATUS_BADVAL: { + int j; + char *ptr, *lim; + char **propvals = NULL; + uint_t valcnt = DLADM_MAX_PROP_VALCNT; + + ptr = malloc((sizeof (char *) + + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT + + MAX_PROP_LINE); + + if (ptr == NULL) + die("insufficient memory"); + propvals = (char **)(void *)ptr; + + for (j = 0; j < DLADM_MAX_PROP_VALCNT; j++) { + propvals[j] = ptr + sizeof (char *) * + DLADM_MAX_PROP_VALCNT + + j * DLADM_PROP_VAL_MAX; + } + s = dladm_get_flowprop(flow, DLADM_PROP_VAL_MODIFIABLE, + aip->ai_name, propvals, &valcnt); + + ptr = errmsg; + lim = ptr + DLADM_STRSIZE; + *ptr = '\0'; + for (j = 0; j < valcnt && s == DLADM_STATUS_OK; j++) { + ptr += snprintf(ptr, lim - ptr, 
"%s,", + propvals[j]); + if (ptr >= lim) + break; + } + if (ptr > errmsg) { + *(ptr - 1) = '\0'; + warn("flow property '%s' must be one of: %s", + aip->ai_name, errmsg); + } else + warn("%s is an invalid value for " + "flow property %s", *val, aip->ai_name); + free(propvals); + break; + } + default: + if (reset) { + warn_dlerr(status, "cannot reset flow property " + "'%s' on '%s'", aip->ai_name, flow); + } else { + warn_dlerr(status, "cannot set flow property " + "'%s' on '%s'", aip->ai_name, flow); + } + break; + } + } +done: + dladm_free_props(proplist); + if (status != DLADM_STATUS_OK) + exit(1); +} + +static void +do_set_flowprop(int argc, char **argv) +{ + set_flowprop(argc, argv, B_FALSE); +} + +static void +do_reset_flowprop(int argc, char **argv) +{ + set_flowprop(argc, argv, B_TRUE); +} + +static void +warn(const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: warning: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) putchar('\n'); +} + +/* PRINTFLIKE2 */ +static void +warn_dlerr(dladm_status_t err, const char *format, ...) +{ + va_list alist; + char errmsg[DLADM_STRSIZE]; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg)); +} + +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) putchar('\n'); + exit(EXIT_FAILURE); +} + +static void +die_optdup(int opt) +{ + die("the option -%c cannot be specified more than once", opt); +} + +static void +die_opterr(int opt, int opterr) +{ + switch (opterr) { + case ':': + die("option '-%c' requires a value", opt); + break; + case '?': + default: + die("unrecognized option '-%c'", opt); + break; + } +} + +/* PRINTFLIKE2 */ +static void +die_dlerr(dladm_status_t err, const char *format, ...) 
+{ + va_list alist; + char errmsg[DLADM_STRSIZE]; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg)); + + exit(EXIT_FAILURE); +} + +static void +print_flowprop(const char *flowname, show_flowprop_state_t *statep, + const char *propname, dladm_prop_type_t type, + const char *format, char **pptr) +{ + int i; + char *ptr, *lim; + char buf[DLADM_STRSIZE]; + char *unknown = "--", *notsup = ""; + char **propvals = statep->fs_propvals; + uint_t valcnt = DLADM_MAX_PROP_VALCNT; + dladm_status_t status; + + status = dladm_get_flowprop(flowname, type, propname, propvals, + &valcnt); + if (status != DLADM_STATUS_OK) { + if (status == DLADM_STATUS_TEMPONLY) { + if (type == DLADM_PROP_VAL_MODIFIABLE && + statep->fs_persist) { + valcnt = 1; + propvals = &unknown; + } else { + statep->fs_status = status; + statep->fs_retstatus = status; + return; + } + } else if (status == DLADM_STATUS_NOTSUP || + statep->fs_persist) { + valcnt = 1; + if (type == DLADM_PROP_VAL_CURRENT) + propvals = &unknown; + else + propvals = ¬sup; + } else { + if ((statep->fs_proplist != NULL) && + statep->fs_status == DLADM_STATUS_OK) { + warn("invalid flow property '%s'", propname); + } + statep->fs_status = status; + statep->fs_retstatus = status; + return; + } + } + + statep->fs_status = DLADM_STATUS_OK; + + ptr = buf; + lim = buf + DLADM_STRSIZE; + for (i = 0; i < valcnt; i++) { + if (propvals[i][0] == '\0' && !statep->fs_parseable) + ptr += snprintf(ptr, lim - ptr, STR_UNDEF_VAL","); + else + ptr += snprintf(ptr, lim - ptr, "%s,", propvals[i]); + if (ptr >= lim) + break; + } + if (valcnt > 0) + buf[strlen(buf) - 1] = '\0'; + + lim = statep->fs_line + MAX_PROP_LINE; + if (statep->fs_parseable) { + *pptr += snprintf(*pptr, lim - *pptr, + "%s", buf); + } else { + *pptr += snprintf(*pptr, lim - *pptr, format, buf); + } +} + +static char * +flowprop_callback(print_field_t *pf, void *fs_arg) +{ + flowprop_args_t *arg = fs_arg; + char *propname = arg->fs_propname; + show_flowprop_state_t *statep = arg->fs_state; + char *ptr = statep->fs_line; + char *lim = ptr + MAX_PROP_LINE; + char *flowname = arg->fs_flowname; + + switch (pf->pf_index) { + case FLOWPROP_FLOW: + (void) snprintf(ptr, lim - ptr, "%s", statep->fs_flow); + break; + case FLOWPROP_PROPERTY: + (void) snprintf(ptr, lim - ptr, "%s", propname); + break; + case FLOWPROP_VALUE: + print_flowprop(flowname, statep, propname, + statep->fs_persist ? DLADM_PROP_VAL_PERSISTENT : + DLADM_PROP_VAL_CURRENT, "%s", &ptr); + /* + * If we failed to query the flow property, for example, query + * the persistent value of a non-persistable flow property, + * simply skip the output. 
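+ *
+ * (Contract sketch: on success print_flowprop() leaves
+ * statep->fs_status at DLADM_STATUS_OK and has appended the
+ * formatted value at ptr; on any other status nothing was
+ * printed, so the field is skipped via the NULL return below.)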
+ */ + if (statep->fs_status != DLADM_STATUS_OK) + goto skip; + ptr = statep->fs_line; + break; + case FLOWPROP_DEFAULT: + print_flowprop(flowname, statep, propname, + DLADM_PROP_VAL_DEFAULT, "%s", &ptr); + if (statep->fs_status != DLADM_STATUS_OK) + goto skip; + ptr = statep->fs_line; + break; + case FLOWPROP_POSSIBLE: + print_flowprop(flowname, statep, propname, + DLADM_PROP_VAL_MODIFIABLE, "%s ", &ptr); + if (statep->fs_status != DLADM_STATUS_OK) + goto skip; + ptr = statep->fs_line; + break; + default: + die("invalid input"); + break; + } + return (ptr); +skip: + if (statep->fs_status != DLADM_STATUS_OK) + return (NULL); + else + return (""); +} + +static int +show_one_flowprop(void *arg, const char *propname) +{ + show_flowprop_state_t *statep = arg; + flowprop_args_t fs_arg; + + bzero(&fs_arg, sizeof (fs_arg)); + fs_arg.fs_state = statep; + fs_arg.fs_propname = (char *)propname; + fs_arg.fs_flowname = (char *)statep->fs_flow; + + if (statep->fs_header) { + statep->fs_header = B_FALSE; + if (!statep ->fs_parseable) + print_header(&statep->fs_print); + } + flowadm_print_output(&statep->fs_print, statep->fs_parseable, + flowprop_callback, (void *)&fs_arg); + + return (DLADM_WALK_CONTINUE); +} + +/* Walker function called by dladm_walk_flow to display flow properties */ +static int +show_flowprop(dladm_flow_attr_t *attr, void *arg) +{ + show_flowprop_one_flow(arg, attr->fa_flowname); + return (DLADM_WALK_CONTINUE); +} + +/* + * Wrapper of dladm_walk_flow(show_walk_fn,...) to make it + * usable to dladm_walk_datalink_id() + */ +static int +show_flowprop_onelink(datalink_id_t linkid, void *arg) +{ + char name[MAXLINKNAMELEN]; + + if (dladm_datalink_id2info(linkid, NULL, NULL, NULL, + name, sizeof (name)) != DLADM_STATUS_OK) + return (DLADM_WALK_TERMINATE); + + (void) dladm_walk_flow(show_flowprop, linkid, arg, B_FALSE); + + return (DLADM_WALK_CONTINUE); +} + +static void +do_show_flowprop(int argc, char **argv) +{ + int option; + dladm_arg_list_t *proplist = NULL; + show_flowprop_state_t state; + char *fields_str = NULL; + print_field_t **fields; + uint_t nfields; + char *all_fields = + "flow,property,value,default,possible"; + + fields_str = all_fields; + opterr = 0; + state.fs_propvals = NULL; + state.fs_line = NULL; + state.fs_parseable = B_FALSE; + state.fs_persist = B_FALSE; + state.fs_header = B_TRUE; + state.fs_retstatus = DLADM_STATUS_OK; + state.fs_linkid = DATALINK_INVALID_LINKID; + state.fs_flow = NULL; + + while ((option = getopt_long(argc, argv, ":p:cPl:o:", + prop_longopts, NULL)) != -1) { + switch (option) { + case 'p': + if (dladm_parse_flow_props(optarg, &proplist, B_TRUE) + != DLADM_STATUS_OK) + die("invalid flow properties specified"); + break; + case 'c': + state.fs_parseable = B_TRUE; + break; + case 'P': + state.fs_persist = B_TRUE; + break; + case 'l': + if (dladm_name2info(optarg, &state.fs_linkid, + NULL, NULL, NULL) != DLADM_STATUS_OK) + die("invalid link '%s'", optarg); + break; + case 'o': + if (strcasecmp(optarg, "all") == 0) + fields_str = all_fields; + else + fields_str = optarg; + break; + default: + die_opterr(optopt, option); + break; + } + } + + if (optind == (argc - 1)) { + if (strlen(argv[optind]) >= MAXFLOWNAME) + die("flow name too long"); + state.fs_flow = argv[optind]; + } else if (optind != argc) { + usage(); + } + bzero(&state.fs_print, sizeof (print_state_t)); + state.fs_proplist = proplist; + state.fs_status = DLADM_STATUS_OK; + + fields = parse_output_fields(fields_str, flowprop_fields, + FLOWPROP_MAX_FIELDS, CMD_TYPE_ANY, &nfields); + + if 
(fields == NULL) { + die("invalid field(s) specified"); + return; + } + + state.fs_print.ps_fields = fields; + state.fs_print.ps_nfields = nfields; + + /* Show properties for one flow */ + if (state.fs_flow != NULL) { + show_flowprop_one_flow(&state, state.fs_flow); + + /* Show properties for all flows on one link */ + } else if (state.fs_linkid != DATALINK_INVALID_LINKID) { + (void) show_flowprop_onelink(state.fs_linkid, &state); + + /* Show properties for all flows on all links */ + } else { + (void) dladm_walk_datalink_id(show_flowprop_onelink, &state, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } + + dladm_free_props(proplist); +} + +static void +show_flowprop_one_flow(void *arg, const char *flow) +{ + int i; + char *buf; + dladm_status_t status; + dladm_arg_list_t *proplist = NULL; + show_flowprop_state_t *statep = arg; + dladm_flow_attr_t attr; + const char *savep; + + /* + * Do not print flow props for invalid flows. + */ + if ((status = dladm_flow_info(flow, &attr)) != DLADM_STATUS_OK) { + die("invalid flow: '%s'", flow); + } + + savep = statep->fs_flow; + statep->fs_flow = flow; + + proplist = statep->fs_proplist; + + buf = malloc((sizeof (char *) + DLADM_PROP_VAL_MAX) + * DLADM_MAX_PROP_VALCNT + MAX_PROP_LINE); + if (buf == NULL) + die("insufficient memory"); + + statep->fs_propvals = (char **)(void *)buf; + for (i = 0; i < DLADM_MAX_PROP_VALCNT; i++) { + statep->fs_propvals[i] = buf + + sizeof (char *) * DLADM_MAX_PROP_VALCNT + + i * DLADM_PROP_VAL_MAX; + } + statep->fs_line = buf + + (sizeof (char *) + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT; + + /* show only specified flow properties */ + if (proplist != NULL) { + for (i = 0; i < proplist->al_count; i++) { + if (show_one_flowprop(statep, + proplist->al_info[i].ai_name) != DLADM_STATUS_OK) + break; + } + + /* show all flow properties */ + } else { + status = dladm_walk_flowprop(show_one_flowprop, flow, statep); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "show-flowprop"); + } + free(buf); + statep->fs_flow = savep; +} + +typedef struct { + char *s_buf; + char **s_fields; /* array of pointer to the fields in s_buf */ + uint_t s_nfields; /* the number of fields in s_buf */ +} split_t; + +/* + * Free the split_t structure pointed to by `sp'. + */ +static void +splitfree(split_t *sp) +{ + free(sp->s_buf); + free(sp->s_fields); + free(sp); +} + +/* + * Split `str' into at most `maxfields' fields, each field at most `maxlen' in + * length. Return a pointer to a split_t containing the split fields, or NULL + * on failure. 
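+ *
+ * For example (sketch):
+ *
+ *	split_t *sp = split("flow,link,ipaddr", 10, MAX_FIELD_LEN);
+ *
+ * yields sp->s_nfields == 3 with s_fields[] pointing at "flow",
+ * "link" and "ipaddr" inside the copied buffer; the caller releases
+ * it with splitfree(sp).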
+ */ +static split_t * +split(const char *str, uint_t maxfields, uint_t maxlen) +{ + char *field, *token, *lasts = NULL; + split_t *sp; + + if (*str == '\0' || maxfields == 0 || maxlen == 0) + return (NULL); + + sp = calloc(sizeof (split_t), 1); + if (sp == NULL) + return (NULL); + + sp->s_buf = strdup(str); + sp->s_fields = malloc(sizeof (char *) * maxfields); + if (sp->s_buf == NULL || sp->s_fields == NULL) + goto fail; + + token = sp->s_buf; + while ((field = strtok_r(token, ",", &lasts)) != NULL) { + if (sp->s_nfields == maxfields || strlen(field) > maxlen) + goto fail; + token = NULL; + sp->s_fields[sp->s_nfields++] = field; + } + return (sp); +fail: + splitfree(sp); + return (NULL); +} + +static print_field_t ** +parse_output_fields(char *str, print_field_t *template, int max_fields, + uint_t cmdtype, uint_t *countp) +{ + split_t *sp; + boolean_t good_match = B_FALSE; + uint_t i, j; + print_field_t **pf = NULL; + + sp = split(str, max_fields, MAX_FIELD_LEN); + + if (sp == NULL) + return (NULL); + + pf = malloc(sp->s_nfields * sizeof (print_field_t *)); + if (pf == NULL) + goto fail; + + for (i = 0; i < sp->s_nfields; i++) { + for (j = 0; j < max_fields; j++) { + if (strcasecmp(sp->s_fields[i], + template[j].pf_name) == 0) { + good_match = template[j]. pf_cmdtype & cmdtype; + break; + } + } + if (!good_match) + goto fail; + + good_match = B_FALSE; + pf[i] = &template[j]; + } + *countp = i; + splitfree(sp); + return (pf); +fail: + free(pf); + splitfree(sp); + return (NULL); +} + +static void +flowadm_print_output(print_state_t *statep, boolean_t parseable, + print_callback_t fn, void *arg) +{ + int i; + char *value; + print_field_t **pf; + + pf = statep->ps_fields; + for (i = 0; i < statep->ps_nfields; i++) { + statep->ps_lastfield = (i + 1 == statep->ps_nfields); + value = (*fn)(pf[i], arg); + if (value != NULL) + print_field(statep, pf[i], value, parseable); + } + (void) putchar('\n'); +} + +static void +print_header(print_state_t *ps) +{ + int i; + print_field_t **pf; + + pf = ps->ps_fields; + for (i = 0; i < ps->ps_nfields; i++) { + ps->ps_lastfield = (i + 1 == ps->ps_nfields); + print_field(ps, pf[i], pf[i]->pf_header, B_FALSE); + } + (void) putchar('\n'); +} + +static void +print_field(print_state_t *statep, print_field_t *pfp, const char *value, + boolean_t parseable) +{ + uint_t width = pfp->pf_width; + uint_t valwidth = strlen(value); + uint_t compress; + + if (parseable) { + (void) printf("%s=\"%s\"", pfp->pf_header, value); + } else { + if (value[0] == '\0') + value = STR_UNDEF_VAL; + if (statep->ps_lastfield) { + (void) printf("%s", value); + return; + } + + if (valwidth > width) { + statep->ps_overflow += valwidth - width; + } else if (valwidth < width && statep->ps_overflow > 0) { + compress = min(statep->ps_overflow, width - valwidth); + statep->ps_overflow -= compress; + width -= compress; + } + (void) printf("%-*s", width, value); + } + + if (!statep->ps_lastfield) + (void) putchar(' '); +} diff --git a/usr/src/cmd/flowadm/flowadm.conf b/usr/src/cmd/flowadm/flowadm.conf new file mode 100644 index 0000000000..3977ddf645 --- /dev/null +++ b/usr/src/cmd/flowadm/flowadm.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# DO NOT EDIT OR PARSE THIS FILE! +# +# Use the flowadm(1m) command to change the contents of this file. + diff --git a/usr/src/cmd/flowadm/flowadm.xcl b/usr/src/cmd/flowadm/flowadm.xcl new file mode 100644 index 0000000000..856a788ed6 --- /dev/null +++ b/usr/src/cmd/flowadm/flowadm.xcl @@ -0,0 +1,113 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# + +msgid "--" +msgid "--," +msgid "" +msgid " " +msgid "%-*s" +msgid "%-10llu" +msgid "%-12llu" +msgid "%-12s" +msgid "%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n" +msgid "%-8llu" +msgid "%-8llu\n" +msgid "%d" +msgid "%s" +msgid "%s " +msgid "%s," +msgid "%s/%d " +msgid "%s: " +msgid "%s=\"%s\"" +msgid "," +msgid "/" +msgid "0x%x" +msgid ": %s\n" +msgid ":d:R:t" +msgid ":p:R:t" +msgid ":p:cPl:o:" +msgid "?" 
+msgid "ATTR" +msgid "DEFAULT" +msgid "FLOW" +msgid "ICMPV6" +msgid "ICMPv6" +msgid "IERRORS" +msgid "IPACKETS" +msgid "LINK" +msgid "NAME" +msgid "OBYTES" +msgid "OERRORS" +msgid "OPACKETS" +msgid "POSSIBLE" +msgid "PROPERTY" +msgid "RBYTES" +msgid "SCTP" +msgid "TCP" +msgid "UDP" +msgid "VALUE" +msgid "add-flow" +msgid "all" +msgid "attr" +msgid "default" +msgid "dsfield" +msgid "dsfield_mask" +msgid "flow" +msgid "flow,property,value,default,possible" +msgid "icmp" +msgid "icmpv6" +msgid "init-flow" +msgid "interval" +msgid "link" +msgid "local_ip" +msgid "local_port" +msgid "name" +msgid "name,link,attr,value" +msgid "net_rawaccess" +msgid "parseable" +msgid "possible" +msgid "prop" +msgid "property" +msgid "psSi:l:o:" +msgid "remote_ip" +msgid "remove-flow" +msgid "reset" +msgid "reset-flowprop" +msgid "root-dir" +msgid "sctp" +msgid "set" +msgid "set-flowprop" +msgid "show-flow" +msgid "show-flowprop" +msgid "show-usage" +msgid "statistics" +msgid "sys_net_config" +msgid "tR:l:a:p:" +msgid "tcp" +msgid "tdps:e:f:" +msgid "temporary" +msgid "transport" +msgid "udp" +msgid "value" diff --git a/usr/src/cmd/flowadm/flowprop.conf b/usr/src/cmd/flowadm/flowprop.conf new file mode 100644 index 0000000000..ad6f802040 --- /dev/null +++ b/usr/src/cmd/flowadm/flowprop.conf @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# +# DO NOT EDIT OR PARSE THIS FILE! +# +# Use the flowadm(1m) command to change the contents of this file. + diff --git a/usr/src/cmd/mdb/Makefile.common b/usr/src/cmd/mdb/Makefile.common index 5677289bc9..ed27426b8d 100644 --- a/usr/src/cmd/mdb/Makefile.common +++ b/usr/src/cmd/mdb/Makefile.common @@ -24,7 +24,8 @@ # # MDB modules used for debugging user processes that every ISA's build # subdirectory will need to build. -# +# + COMMON_MODULES_PROC = \ dof \ libavl \ @@ -70,6 +71,7 @@ COMMON_MODULES_KVM = \ krtld \ lofs \ logindmux \ + mac \ md \ nca \ nsctl \ diff --git a/usr/src/cmd/mdb/common/modules/mac/mac.c b/usr/src/cmd/mdb/common/modules/mac/mac.c new file mode 100644 index 0000000000..0f1effb4b2 --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/mac/mac.c @@ -0,0 +1,685 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/mdb_modapi.h> +#include <sys/types.h> +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_flow_impl.h> +#include <sys/mac_soft_ring.h> + +#define STRSIZE 64 +#define MAC_RX_SRS_SIZE (MAX_RINGS_PER_GROUP * sizeof (uintptr_t)) + +#define LAYERED_WALKER_FOR_FLOW "flow_entry_cache" +#define LAYERED_WALKER_FOR_SRS "mac_srs_cache" +#define LAYERED_WALKER_FOR_RING "mac_ring_cache" + +/* arguments passed to mac_flow dee-command */ +#define MAC_FLOW_NONE 0x01 +#define MAC_FLOW_ATTR 0x02 +#define MAC_FLOW_PROP 0x04 +#define MAC_FLOW_RX 0x08 +#define MAC_FLOW_TX 0x10 +#define MAC_FLOW_USER 0x20 +#define MAC_FLOW_STATS 0x40 +#define MAC_FLOW_MISC 0x80 + +/* arguments passed to mac_srs dee-command */ +#define MAC_SRS_RX 0x01 +#define MAC_SRS_TX 0x02 + +static char * +mac_flow_proto2str(uint8_t protocol) +{ + switch (protocol) { + case IPPROTO_TCP: + return ("tcp"); + case IPPROTO_UDP: + return ("udp"); + case IPPROTO_SCTP: + return ("sctp"); + case IPPROTO_ICMP: + return ("icmp"); + case IPPROTO_ICMPV6: + return ("icmpv6"); + default: + return ("--"); + } +} + +static char * +mac_flow_priority2str(mac_priority_level_t prio) +{ + switch (prio) { + case MPL_LOW: + return ("low"); + case MPL_MEDIUM: + return ("medium"); + case MPL_HIGH: + return ("high"); + case MPL_RESET: + return ("reset"); + default: + return ("--"); + } +} + +/* + * Convert bandwidth in bps to a string in mpbs. 
+ */ +static char * +mac_flow_bw2str(uint64_t bw, char *buf, ssize_t len) +{ + int kbps, mbps; + + kbps = (bw % 1000000)/1000; + mbps = bw/1000000; + if ((mbps == 0) && (kbps != 0)) + mdb_snprintf(buf, len, "0.%03u", kbps); + else + mdb_snprintf(buf, len, "%5u", mbps); + return (buf); +} + +static void +mac_flow_print_header(uint_t args) +{ + switch (args) { + case MAC_FLOW_NONE: + mdb_printf("%<u>%?s %-32s %-6s %?s %?s %-20s%</u>\n", + "ADDR", "FLOW NAME", "LINKID", "MCIP", "MIP", + "MIP NAME"); + break; + case MAC_FLOW_ATTR: + mdb_printf("%<u>%?s %-32s %-7s %6s " + "%-9s %s%</u>\n", + "ADDR", "FLOW NAME", "PROTO", "PORT", + "DSFLD:MSK", "IPADDR"); + break; + case MAC_FLOW_PROP: + mdb_printf("%<u>%?s %-32s %8s %9s%</u>\n", + "ADDR", "FLOW NAME", "MAXBW(M)", "PRIORITY"); + break; + case MAC_FLOW_MISC: + mdb_printf("%<u>%?s %-32s %10s %10s " + "%32s %s%</u>\n", + "ADDR", "FLOW NAME", "TYPE", "FLAGS", + "MATCH_FN", "ZONE"); + break; + case MAC_FLOW_RX: + mdb_printf("%<u>%?s %-24s %-30s %?s " + "%?s %7s %s%</u>\n", + "ADDR", "FLOW NAME", "CB_FUNC", "CB_ARG1", + "CB_ARG2", "SRS_CNT", "RX_SRS"); + break; + case MAC_FLOW_TX: + mdb_printf("%<u>%?s %-32s %?s %</u>\n", + "ADDR", "FLOW NAME", "TX_SRS"); + break; + case MAC_FLOW_STATS: + mdb_printf("%<u>%?s %-32s %?s %?s%</u>\n", + "ADDR", "FLOW NAME", "RBYTES", "OBYTES"); + break; + } +} + +/* + * Display selected fields of the flow_entry_t structure + */ +static int +mac_flow_dcmd_output(uintptr_t addr, uint_t flags, uint_t args) +{ + static const mdb_bitmask_t flow_type_bits[] = { + {"P", FLOW_PRIMARY_MAC, FLOW_PRIMARY_MAC}, + {"V", FLOW_VNIC_MAC, FLOW_VNIC_MAC}, + {"M", FLOW_MCAST, FLOW_MCAST}, + {"O", FLOW_OTHER, FLOW_OTHER}, + {"U", FLOW_USER, FLOW_USER}, + {"V", FLOW_VNIC, FLOW_VNIC}, + {"NS", FLOW_NO_STATS, FLOW_NO_STATS}, + { NULL, 0, 0 } + }; +#define FLOW_MAX_TYPE (sizeof (flow_type_bits) / sizeof (mdb_bitmask_t)) + + static const mdb_bitmask_t flow_flag_bits[] = { + {"Q", FE_QUIESCE, FE_QUIESCE}, + {"W", FE_WAITER, FE_WAITER}, + {"T", FE_FLOW_TAB, FE_FLOW_TAB}, + {"G", FE_G_FLOW_HASH, FE_G_FLOW_HASH}, + {"I", FE_INCIPIENT, FE_INCIPIENT}, + {"C", FE_CONDEMNED, FE_CONDEMNED}, + {"NU", FE_UF_NO_DATAPATH, FE_UF_NO_DATAPATH}, + {"NC", FE_MC_NO_DATAPATH, FE_MC_NO_DATAPATH}, + { NULL, 0, 0 } + }; +#define FLOW_MAX_FLAGS (sizeof (flow_flag_bits) / sizeof (mdb_bitmask_t)) + flow_entry_t fe; + mac_client_impl_t mcip; + mac_impl_t mip; + + if (mdb_vread(&fe, sizeof (fe), addr) == -1) { + mdb_warn("failed to read struct flow_entry_s at %p", addr); + return (DCMD_ERR); + } + if (args & MAC_FLOW_USER) { + args &= ~MAC_FLOW_USER; + if (fe.fe_type & FLOW_MCAST) { + if (DCMD_HDRSPEC(flags)) + mac_flow_print_header(args); + return (DCMD_OK); + } + } + if (DCMD_HDRSPEC(flags)) + mac_flow_print_header(args); + bzero(&mcip, sizeof (mcip)); + bzero(&mip, sizeof (mip)); + if (fe.fe_mcip != NULL && mdb_vread(&mcip, sizeof (mcip), + (uintptr_t)fe.fe_mcip) == sizeof (mcip)) { + (void) mdb_vread(&mip, sizeof (mip), (uintptr_t)mcip.mci_mip); + } + switch (args) { + case MAC_FLOW_NONE: { + mdb_printf("%?p %-32s %6d %?p " + "%?p %-20s\n", + addr, fe.fe_flow_name, fe.fe_link_id, fe.fe_mcip, + mcip.mci_mip, mip.mi_name); + break; + } + case MAC_FLOW_ATTR: { + struct in_addr in4; + uintptr_t desc_addr; + flow_desc_t fdesc; + + desc_addr = addr + OFFSETOF(flow_entry_t, fe_flow_desc); + if (mdb_vread(&fdesc, sizeof (fdesc), desc_addr) == -1) { + mdb_warn("failed to read struct flow_description at %p", + desc_addr); + return (DCMD_ERR); + } + mdb_printf("%?p %-32s " + "%-7s 
%6d"
+ "%4d:%-4d ",
+ addr, fe.fe_flow_name,
+ mac_flow_proto2str(fdesc.fd_protocol), fdesc.fd_local_port,
+ fdesc.fd_dsfield, fdesc.fd_dsfield_mask);
+ if (fdesc.fd_ipversion == IPV4_VERSION) {
+ IN6_V4MAPPED_TO_INADDR(&fdesc.fd_local_addr, &in4);
+ mdb_printf("%I", in4.s_addr);
+ } else if (fdesc.fd_ipversion == IPV6_VERSION) {
+ mdb_printf("%N", &fdesc.fd_local_addr);
+ } else {
+ mdb_printf("%s", "--");
+ }
+ mdb_printf("\n");
+ break;
+ }
+ case MAC_FLOW_PROP: {
+ uintptr_t prop_addr;
+ char bwstr[STRSIZE];
+ mac_resource_props_t fprop;
+
+ prop_addr = addr + OFFSETOF(flow_entry_t, fe_resource_props);
+ if (mdb_vread(&fprop, sizeof (fprop), prop_addr) == -1) {
+ mdb_warn("failed to read struct mac_resource_props "
+ "at %p", prop_addr);
+ return (DCMD_ERR);
+ }
+ mdb_printf("%?p %-32s "
+ "%8s %9s\n",
+ addr, fe.fe_flow_name,
+ mac_flow_bw2str(fprop.mrp_maxbw, bwstr, STRSIZE),
+ mac_flow_priority2str(fprop.mrp_priority));
+ break;
+ }
+ case MAC_FLOW_MISC: {
+ char flow_flags[2 * FLOW_MAX_FLAGS];
+ char flow_type[2 * FLOW_MAX_TYPE];
+ GElf_Sym sym;
+ char func_name[MDB_SYM_NAMLEN] = "";
+ uintptr_t func, match_addr;
+
+ match_addr = addr + OFFSETOF(flow_entry_t, fe_match);
+ (void) mdb_vread(&func, sizeof (func), match_addr);
+ (void) mdb_lookup_by_addr(func, MDB_SYM_EXACT, func_name,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_snprintf(flow_flags, 2 * FLOW_MAX_FLAGS, "%hb",
+ fe.fe_flags, flow_flag_bits);
+ mdb_snprintf(flow_type, 2 * FLOW_MAX_TYPE, "%hb",
+ fe.fe_type, flow_type_bits);
+ mdb_printf("%?p %-32s %10s %10s "
+ "%32s %-d\n",
+ addr, fe.fe_flow_name, flow_type, flow_flags,
+ func_name, fe.fe_zoneid);
+ break;
+ }
+ case MAC_FLOW_RX: {
+ uintptr_t rx_srs[MAX_RINGS_PER_GROUP] = {0};
+ char cb_fn[MDB_SYM_NAMLEN] = "";
+ uintptr_t cb_fnaddr, fnaddr, rxaddr;
+ int i;
+ GElf_Sym sym;
+
+ rxaddr = addr + OFFSETOF(flow_entry_t, fe_rx_srs);
+ (void) mdb_vread(rx_srs, MAC_RX_SRS_SIZE, rxaddr);
+ fnaddr = addr + OFFSETOF(flow_entry_t, fe_cb_fn);
+ (void) mdb_vread(&cb_fnaddr, sizeof (cb_fnaddr), fnaddr);
+ (void) mdb_lookup_by_addr(cb_fnaddr, MDB_SYM_EXACT, cb_fn,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_printf("%?p %-24s %-30s %?p "
+ "%?p %7d ",
+ addr, fe.fe_flow_name, cb_fn, fe.fe_cb_arg1,
+ fe.fe_cb_arg2, fe.fe_rx_srs_cnt);
+ for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
+ if (rx_srs[i] == 0)
+ continue;
+ mdb_printf("%p ", rx_srs[i]);
+ }
+ mdb_printf("\n");
+ break;
+ }
+ case MAC_FLOW_TX: {
+ uintptr_t tx_srs = 0, txaddr;
+
+ txaddr = addr + OFFSETOF(flow_entry_t, fe_tx_srs);
+ (void) mdb_vread(&tx_srs, sizeof (uintptr_t), txaddr);
+ mdb_printf("%?p %-32s %?p\n",
+ addr, fe.fe_flow_name, tx_srs);
+ break;
+ }
+ case MAC_FLOW_STATS: {
+ mdb_printf("%?p %-32s %16llu %16llu\n",
+ addr, fe.fe_flow_name, fe.fe_flowstats.fs_rbytes,
+ fe.fe_flowstats.fs_obytes);
+ break;
+ }
+ }
+ return (DCMD_OK);
+}
+
+/*
+ * Parse the arguments passed to the dcmd and print one or all flow_entry_t
+ * structures.
+ */
+static int
+mac_flow_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint_t args = 0;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_flow", "mac_flow", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_flow'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if ((mdb_getopts(argc, argv,
+ 'a', MDB_OPT_SETBITS, MAC_FLOW_ATTR, &args,
+ 'p', MDB_OPT_SETBITS, MAC_FLOW_PROP, &args,
+ 'm', MDB_OPT_SETBITS, MAC_FLOW_MISC, &args,
+ 'r', MDB_OPT_SETBITS, MAC_FLOW_RX, &args,
+ 't', MDB_OPT_SETBITS, MAC_FLOW_TX, &args,
+ 's', MDB_OPT_SETBITS, MAC_FLOW_STATS, &args,
+ 
'u', MDB_OPT_SETBITS, MAC_FLOW_USER, &args) != argc)) {
+ return (DCMD_USAGE);
+ }
+ if (argc > 2 || (argc == 2 && !(args & MAC_FLOW_USER)))
+ return (DCMD_USAGE);
+ /*
+ * If no arguments were specified, or only "-u" was specified,
+ * default to printing basic flow information.
+ */
+ if (args == 0 || args == MAC_FLOW_USER)
+ args |= MAC_FLOW_NONE;
+
+ return (mac_flow_dcmd_output(addr, flags, args));
+}
+
+static void
+mac_flow_help(void)
+{
+ mdb_printf("If an address is specified, then flow_entry structure at "
+ "that address is printed. Otherwise all the flows in the system "
+ "are printed.\n");
+ mdb_printf("Options:\n"
+ "\t-u\tdisplay user defined link & vnic flows.\n"
+ "\t-a\tdisplay flow attributes\n"
+ "\t-p\tdisplay flow properties\n"
+ "\t-r\tdisplay rx side information\n"
+ "\t-t\tdisplay tx side information\n"
+ "\t-s\tdisplay flow statistics\n"
+ "\t-m\tdisplay miscellaneous flow information\n\n");
+ mdb_printf("%<u>Interpreting Flow type and Flow flags output.%</u>\n");
+ mdb_printf("Flow Types:\n");
+ mdb_printf("\t P --> FLOW_PRIMARY_MAC\n");
+ mdb_printf("\t V --> FLOW_VNIC_MAC\n");
+ mdb_printf("\t M --> FLOW_MCAST\n");
+ mdb_printf("\t O --> FLOW_OTHER\n");
+ mdb_printf("\t U --> FLOW_USER\n");
+ mdb_printf("\t NS --> FLOW_NO_STATS\n\n");
+ mdb_printf("Flow Flags:\n");
+ mdb_printf("\t Q --> FE_QUIESCE\n");
+ mdb_printf("\t W --> FE_WAITER\n");
+ mdb_printf("\t T --> FE_FLOW_TAB\n");
+ mdb_printf("\t G --> FE_G_FLOW_HASH\n");
+ mdb_printf("\t I --> FE_INCIPIENT\n");
+ mdb_printf("\t C --> FE_CONDEMNED\n");
+ mdb_printf("\t NU --> FE_UF_NO_DATAPATH\n");
+ mdb_printf("\t NC --> FE_MC_NO_DATAPATH\n");
+}
+
+/*
+ * Called once by the debugger when the mac_flow walk begins.
+ */
+static int
+mac_flow_walk_init(mdb_walk_state_t *wsp)
+{
+ if (mdb_layered_walk(LAYERED_WALKER_FOR_FLOW, wsp) == -1) {
+ mdb_warn("failed to walk 'mac_flow'");
+ return (WALK_ERR);
+ }
+ return (WALK_NEXT);
+}
+
+/*
+ * Common walker step function for flow_entry_t, mac_soft_ring_set_t and
+ * mac_ring_t.
+ *
+ * Steps through each flow_entry_t and calls the callback function. If the
+ * user executed ::walk mac_flow, it just prints the address; if the user
+ * executed ::mac_flow, it displays selected fields of the flow_entry_t
+ * structure by calling mac_flow_dcmd().
+ */
+static int
+mac_common_walk_step(mdb_walk_state_t *wsp)
+{
+ int status;
+
+ if (wsp->walk_addr == NULL)
+ return (WALK_DONE);
+
+ status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data,
+ wsp->walk_cbdata);
+
+ return (status);
+}
+
+static char *
+mac_srs_txmode2str(mac_tx_srs_mode_t mode)
+{
+ switch (mode) {
+ case SRS_TX_DEFAULT:
+ return ("default");
+ case SRS_TX_SERIALIZE:
+ return ("serialize");
+ case SRS_TX_FANOUT:
+ return ("fanout");
+ case SRS_TX_BW:
+ return ("bw");
+ case SRS_TX_BW_FANOUT:
+ return ("bw fanout");
+ }
+ return ("--");
+}
+
+static void
+mac_srs_help(void)
+{
+ mdb_printf("If an address is specified, then mac_soft_ring_set "
+ "structure at that address is printed. 
Otherwise all the "
+ "SRS in the system are printed.\n");
+ mdb_printf("Options:\n"
+ "\t-r\tdisplay receive side SRS structures\n"
+ "\t-t\tdisplay transmit side SRS structures\n");
+}
+
+static int
+mac_srs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint_t args = 0;
+ mac_soft_ring_set_t srs;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_srs", "mac_srs", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_srs'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if ((mdb_getopts(argc, argv,
+ 'r', MDB_OPT_SETBITS, MAC_SRS_RX, &args,
+ 't', MDB_OPT_SETBITS, MAC_SRS_TX, &args) != argc)) {
+ return (DCMD_USAGE);
+ }
+ if (argc > 1)
+ return (DCMD_USAGE);
+
+ if (mdb_vread(&srs, sizeof (srs), addr) == -1) {
+ mdb_warn("failed to read struct mac_soft_ring_set_s at %p",
+ addr);
+ return (DCMD_ERR);
+ }
+
+ switch (args) {
+ case MAC_SRS_RX: {
+ GElf_Sym sym;
+ char func_name[MDB_SYM_NAMLEN] = "";
+ char l_proc_name[MDB_SYM_NAMLEN] = "";
+ uintptr_t func, lproc, funcaddr, lprocaddr, rxaddr;
+
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %8s %-8s "
+ "%8s %-20s %-s%</u>\n",
+ "ADDR", "MBLK_CNT", "Q_BYTES",
+ "POLL_CNT", "SR_FUNC", "SR_LOWER_FUNC");
+ }
+ if (srs.srs_type & SRST_TX)
+ return (DCMD_OK);
+ rxaddr = addr + OFFSETOF(mac_soft_ring_set_t, srs_rx);
+ funcaddr = rxaddr + OFFSETOF(mac_srs_rx_t, sr_func);
+ lprocaddr = rxaddr + OFFSETOF(mac_srs_rx_t, sr_lower_proc);
+ (void) mdb_vread(&func, sizeof (func), funcaddr);
+ (void) mdb_vread(&lproc, sizeof (lproc), lprocaddr);
+ (void) mdb_lookup_by_addr(func, MDB_SYM_EXACT, func_name,
+ MDB_SYM_NAMLEN, &sym);
+ (void) mdb_lookup_by_addr(lproc, MDB_SYM_EXACT, l_proc_name,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_printf("%?p %-8d %-8d "
+ "%-8d %-20s %-s\n",
+ addr, srs.srs_count, srs.srs_size,
+ srs.srs_rx.sr_poll_count, func_name, l_proc_name);
+ break;
+ }
+ case MAC_SRS_TX: {
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %-10s %-5s %-7s %-7s "
+ "%-7s %-7s %-7s%</u>\n",
+ "ADDR", "TX_MODE", "WOKEN", "DROP", "BLOCK",
+ "UNBLOCK", "MBLK", "SR_CNT");
+ }
+ if (!(srs.srs_type & SRST_TX))
+ return (DCMD_OK);
+
+ mdb_printf("%?p %-10s "
+ "%-5d %-7d "
+ "%-7d %-7d "
+ "%-7d %-7d\n",
+ addr, mac_srs_txmode2str(srs.srs_tx.st_mode),
+ srs.srs_tx.st_woken_up, srs.srs_tx.st_drop_count,
+ srs.srs_tx.st_blocked_cnt, srs.srs_tx.st_unblocked_cnt,
+ srs.srs_count, srs.srs_oth_ring_count);
+ break;
+ }
+ default: {
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %?s %?s %?s %-3s "
+ "%-8s %-8s %-7s %</u>\n",
+ "ADDR", "MCIP", "FLENT", "RING", "DIR",
+ "TYPE", "STATE", "SR_CNT");
+ }
+ mdb_printf("%?p %?p %?p %?p "
+ "%-3s "
+ "%08x %08x %-7d \n",
+ addr, srs.srs_mcip, srs.srs_flent, srs.srs_ring,
+ (srs.srs_type & SRST_TX ? 
"TX" : "RX"), + srs.srs_type, srs.srs_state, srs.srs_soft_ring_count); + break; + } + } + return (DCMD_OK); +} + +static int +mac_srs_walk_init(mdb_walk_state_t *wsp) +{ + if (mdb_layered_walk(LAYERED_WALKER_FOR_SRS, wsp) == -1) { + mdb_warn("failed to walk 'mac_srs'"); + return (WALK_ERR); + } + return (WALK_NEXT); +} + +static char * +mac_ring_state2str(mac_ring_state_t state) +{ + switch (state) { + case MR_FREE: + return ("free"); + case MR_NEWLY_ADDED: + return ("new"); + case MR_INUSE: + return ("inuse"); + } + return ("--"); +} + +static char * +mac_ring_classify2str(mac_classify_type_t classify) +{ + switch (classify) { + case MAC_NO_CLASSIFIER: + return ("no"); + case MAC_SW_CLASSIFIER: + return ("sw"); + case MAC_HW_CLASSIFIER: + return ("hw"); + } + return ("--"); +} + +static int +mac_ring_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + mac_ring_t ring; + mac_group_t group; + flow_entry_t flent; + mac_soft_ring_set_t srs; + + if (!(flags & DCMD_ADDRSPEC)) { + if (mdb_walk_dcmd("mac_ring", "mac_ring", argc, argv) == -1) { + mdb_warn("failed to walk 'mac_ring'"); + return (DCMD_ERR); + } + return (DCMD_OK); + } + if (mdb_vread(&ring, sizeof (ring), addr) == -1) { + mdb_warn("failed to read struct mac_ring_s at %p", addr); + return (DCMD_ERR); + } + bzero(&flent, sizeof (flent)); + if (mdb_vread(&srs, sizeof (srs), (uintptr_t)ring.mr_srs) != -1) { + (void) mdb_vread(&flent, sizeof (flent), + (uintptr_t)srs.srs_flent); + } + (void) mdb_vread(&group, sizeof (group), (uintptr_t)ring.mr_gh); + if (DCMD_HDRSPEC(flags)) { + mdb_printf("%<u>%?s %4s %5s %4s %?s " + "%5s %?s %?s %s %</u>\n", + "ADDR", "TYPE", "STATE", "FLAG", "GROUP", + "CLASS", "MIP", "SRS", "FLOW NAME"); + } + mdb_printf("%?p %-4s " + "%5s %04x " + "%?p %-5s " + "%?p %?p %s\n", + addr, ((ring.mr_type == 1)? "RX" : "TX"), + mac_ring_state2str(ring.mr_state), ring.mr_flag, + ring.mr_gh, mac_ring_classify2str(ring.mr_classify_type), + group.mrg_mh, ring.mr_srs, flent.fe_flow_name); + return (DCMD_OK); +} + +static int +mac_ring_walk_init(mdb_walk_state_t *wsp) +{ + if (mdb_layered_walk(LAYERED_WALKER_FOR_RING, wsp) == -1) { + mdb_warn("failed to walk `mac_ring`"); + return (WALK_ERR); + } + return (WALK_NEXT); +} + +static void +mac_ring_help(void) +{ + mdb_printf("If an address is specified, then mac_ring_t " + "structure at that address is printed. 
Otherwise all the " + "hardware rings in the system are printed.\n"); +} + +/* Supported dee-commands */ +static const mdb_dcmd_t dcmds[] = { + {"mac_flow", "?[-u] [-aprtsm]", "display Flow Entry structures", + mac_flow_dcmd, mac_flow_help}, + {"mac_srs", "?[-rt]", "display MAC Soft Ring Set structures", + mac_srs_dcmd, mac_srs_help}, + {"mac_ring", "?", "display MAC ring (hardware) structures", + mac_ring_dcmd, mac_ring_help}, + { NULL } +}; + +/* Supported walkers */ +static const mdb_walker_t walkers[] = { + {"mac_flow", "walk list of flow entry structures", mac_flow_walk_init, + mac_common_walk_step, NULL, NULL}, + {"mac_srs", "walk list of mac soft ring set structures", + mac_srs_walk_init, mac_common_walk_step, NULL, NULL}, + {"mac_ring", "walk list of mac ring structures", mac_ring_walk_init, + mac_common_walk_step, NULL, NULL}, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers }; + +const mdb_modinfo_t * +_mdb_init(void) +{ + return (&modinfo); +} diff --git a/usr/src/cmd/mdb/intel/amd64/mac/Makefile b/usr/src/cmd/mdb/intel/amd64/mac/Makefile new file mode 100644 index 0000000000..6f24b28ea6 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/mac/Makefile @@ -0,0 +1,34 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +MODULE = mac.so +MDBTGT = kvm + +MODSRCS = mac.c + +include ../../../../Makefile.cmd +include ../../../../Makefile.cmd.64 +include ../../Makefile.amd64 +include ../../../Makefile.module diff --git a/usr/src/cmd/mdb/intel/ia32/mac/Makefile b/usr/src/cmd/mdb/intel/ia32/mac/Makefile new file mode 100644 index 0000000000..69c8c97b19 --- /dev/null +++ b/usr/src/cmd/mdb/intel/ia32/mac/Makefile @@ -0,0 +1,33 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +MODULE = mac.so +MDBTGT = kvm + +MODSRCS = mac.c + +include ../../../../Makefile.cmd +include ../../Makefile.ia32 +include ../../../Makefile.module diff --git a/usr/src/cmd/mdb/sparc/v9/mac/Makefile b/usr/src/cmd/mdb/sparc/v9/mac/Makefile new file mode 100644 index 0000000000..1456211245 --- /dev/null +++ b/usr/src/cmd/mdb/sparc/v9/mac/Makefile @@ -0,0 +1,34 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +MODULE = mac.so +MDBTGT = kvm + +MODSRCS = mac.c + +include ../../../../Makefile.cmd +include ../../../../Makefile.cmd.64 +include ../../Makefile.sparcv9 +include ../../../Makefile.module diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com index a7293e76f1..365371c45c 100644 --- a/usr/src/cmd/rcm_daemon/Makefile.com +++ b/usr/src/cmd/rcm_daemon/Makefile.com @@ -22,8 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# include ../../Makefile.cmd @@ -51,6 +49,7 @@ COMMON_MOD_SRC = \ $(COMMON)/swap_rcm.c \ $(COMMON)/network_rcm.c \ $(COMMON)/vlan_rcm.c \ + $(COMMON)/vnic_rcm.c \ $(COMMON)/aggr_rcm.c \ $(COMMON)/ip_rcm.c \ $(COMMON)/cluster_rcm.c \ @@ -71,6 +70,7 @@ COMMON_MOD_OBJ = \ swap_rcm.o \ network_rcm.o \ vlan_rcm.o \ + vnic_rcm.o \ aggr_rcm.o \ ip_rcm.o \ cluster_rcm.o \ @@ -89,6 +89,7 @@ COMMON_RCM_MODS = \ SUNW_swap_rcm.so \ SUNW_network_rcm.so \ SUNW_vlan_rcm.so \ + SUNW_vnic_rcm.so \ SUNW_aggr_rcm.so \ SUNW_ip_rcm.so \ SUNW_cluster_rcm.so \ @@ -121,6 +122,7 @@ SUNW_pool_rcm.so := LDLIBS_MODULES += -L$(ROOT)/usr/lib -lpool SUNW_svm_rcm.so := LDLIBS_MODULES += -L$(ROOT)/usr/lib -lmeta SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm +SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil diff --git a/usr/src/cmd/rcm_daemon/common/vlan_rcm.c b/usr/src/cmd/rcm_daemon/common/vlan_rcm.c index 1177d5e384..a657baa2d4 100644 --- a/usr/src/cmd/rcm_daemon/common/vlan_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/vlan_rcm.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This RCM module adds support to the RCM framework for VLAN links */ @@ -68,7 +66,6 @@ typedef struct dl_vlan { struct dl_vlan *dv_next; /* next VLAN on the same link */ struct dl_vlan *dv_prev; /* prev VLAN on the same link */ datalink_id_t dv_vlanid; - boolean_t dv_implicit; vlan_flag_t dv_flags; /* VLAN link flags */ } dl_vlan_t; @@ -399,7 +396,6 @@ vlan_online_vlan(link_cache_t *node) if (!(vlan->dv_flags & VLAN_OFFLINED)) continue; - assert(!vlan->dv_implicit); if ((status = dladm_vlan_up(vlan->dv_vlanid)) != DLADM_STATUS_OK) { /* @@ -429,10 +425,6 @@ vlan_offline_vlan(link_cache_t *node, uint32_t flags, cache_node_state_t state) * Try to delete all explicit created VLAN */ for (vlan = node->vc_vlan; vlan != NULL; vlan = vlan->dv_next) { - - if (vlan->dv_implicit) - continue; - if ((status = dladm_vlan_delete(vlan->dv_vlanid, DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) { rcm_log_message(RCM_WARNING, @@ -918,7 +910,6 @@ vlan_update(datalink_id_t vlanid, void *arg) node->vc_vlan = vlan; } - vlan->dv_implicit = vlan_attr.dv_implicit; node->vc_state &= ~CACHE_NODE_STALE; if (newnode) @@ -1186,18 +1177,16 @@ vlan_notify_new_vlan(rcm_handle_t *hd, char *rsrc) } for (vlan = node->vc_vlan; vlan != NULL; vlan = vlan->dv_next) { - if (!vlan->dv_implicit) { - rcm_log_message(RCM_TRACE2, - "VLAN: vlan_notify_new_vlan add (%u)\n", - vlan->dv_vlanid); + rcm_log_message(RCM_TRACE2, + "VLAN: vlan_notify_new_vlan add (%u)\n", + vlan->dv_vlanid); - id = vlan->dv_vlanid; - if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) { - rcm_log_message(RCM_ERROR, - _("VLAN: failed to construct nvlist\n")); - (void) mutex_unlock(&cache_lock); - goto done; - } + id = vlan->dv_vlanid; + if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) { + rcm_log_message(RCM_ERROR, + _("VLAN: failed to construct nvlist\n")); + (void) mutex_unlock(&cache_lock); + goto done; } } (void) mutex_unlock(&cache_lock); diff --git a/usr/src/cmd/rcm_daemon/common/vnic_rcm.c b/usr/src/cmd/rcm_daemon/common/vnic_rcm.c new file mode 100644 index 0000000000..178d3b44a8 --- /dev/null +++ b/usr/src/cmd/rcm_daemon/common/vnic_rcm.c @@ -0,0 +1,1329 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * This RCM module adds support to the RCM framework for VNIC links + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <synch.h> +#include <assert.h> +#include <strings.h> +#include "rcm_module.h" +#include <libintl.h> +#include <libdllink.h> +#include <libdlvnic.h> +#include <libdlpi.h> + +/* + * Definitions + */ +#ifndef lint +#define _(x) gettext(x) +#else +#define _(x) x +#endif + +/* Some generic well-knowns and defaults used in this module */ +#define RCM_LINK_PREFIX "SUNW_datalink" /* RCM datalink name prefix */ +#define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH) + +/* VNIC link flags */ +typedef enum { + VNIC_OFFLINED = 0x1, + VNIC_CONSUMER_OFFLINED = 0x2, + VNIC_STALE = 0x4 +} vnic_flag_t; + +/* link representation */ +typedef struct dl_vnic { + struct dl_vnic *dlv_next; /* next VNIC on the same link */ + struct dl_vnic *dlv_prev; /* prev VNIC on the same link */ + datalink_id_t dlv_vnic_id; + vnic_flag_t dlv_flags; /* VNIC link flags */ +} dl_vnic_t; + +/* VNIC Cache state flags */ +typedef enum { + CACHE_NODE_STALE = 0x1, /* stale cached data */ + CACHE_NODE_NEW = 0x2, /* new cached nodes */ + CACHE_NODE_OFFLINED = 0x4 /* nodes offlined */ +} cache_node_state_t; + +/* Network Cache lookup options */ +#define CACHE_NO_REFRESH 0x1 /* cache refresh not needed */ +#define CACHE_REFRESH 0x2 /* refresh cache */ + +/* Cache element */ +typedef struct link_cache { + struct link_cache *vc_next; /* next cached resource */ + struct link_cache *vc_prev; /* prev cached resource */ + char *vc_resource; /* resource name */ + datalink_id_t vc_linkid; /* linkid */ + dl_vnic_t *vc_vnic; /* VNIC list on this link */ + cache_node_state_t vc_state; /* cache state flags */ +} link_cache_t; + +/* + * Global cache for network VNICs + */ +static link_cache_t cache_head; +static link_cache_t cache_tail; +static mutex_t cache_lock; +static int events_registered = 0; + +/* + * RCM module interface prototypes + */ +static int vnic_register(rcm_handle_t *); +static int vnic_unregister(rcm_handle_t *); +static int vnic_get_info(rcm_handle_t *, char *, id_t, uint_t, + char **, char **, nvlist_t *, rcm_info_t **); +static int vnic_suspend(rcm_handle_t *, char *, id_t, + timespec_t *, uint_t, char **, rcm_info_t **); +static int vnic_resume(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_offline(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_undo_offline(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_remove(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_notify_event(rcm_handle_t *, char *, id_t, uint_t, + char **, nvlist_t *, rcm_info_t **); +static int vnic_configure(rcm_handle_t *, datalink_id_t); + +/* Module private routines */ +static void cache_free(); +static int cache_update(rcm_handle_t *); +static void cache_remove(link_cache_t *); +static void node_free(link_cache_t *); +static void cache_insert(link_cache_t *); +static link_cache_t *cache_lookup(rcm_handle_t *, char *, char); +static int vnic_consumer_offline(rcm_handle_t *, link_cache_t *, + char **, uint_t, rcm_info_t **); +static void vnic_consumer_online(rcm_handle_t *, link_cache_t *, + char **, uint_t, rcm_info_t **); +static int vnic_offline_vnic(link_cache_t *, uint32_t, + cache_node_state_t); +static void vnic_online_vnic(link_cache_t *); +static char *vnic_usage(link_cache_t *); +static void vnic_log_err(datalink_id_t, 
char **, char *);
+static int vnic_consumer_notify(rcm_handle_t *, datalink_id_t,
+ char **, uint_t, rcm_info_t **);
+
+/* Module-Private data */
+static struct rcm_mod_ops vnic_ops =
+{
+ RCM_MOD_OPS_VERSION,
+ vnic_register,
+ vnic_unregister,
+ vnic_get_info,
+ vnic_suspend,
+ vnic_resume,
+ vnic_offline,
+ vnic_undo_offline,
+ vnic_remove,
+ NULL,
+ NULL,
+ vnic_notify_event
+};
+
+/*
+ * rcm_mod_init() - Update registrations, and return the ops structure.
+ */
+struct rcm_mod_ops *
+rcm_mod_init(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_init\n");
+
+ cache_head.vc_next = &cache_tail;
+ cache_head.vc_prev = NULL;
+ cache_tail.vc_prev = &cache_head;
+ cache_tail.vc_next = NULL;
+ (void) mutex_init(&cache_lock, 0, NULL);
+
+ /* Return the ops vectors */
+ return (&vnic_ops);
+}
+
+/*
+ * rcm_mod_info() - Return a string describing this module.
+ */
+const char *
+rcm_mod_info(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_info\n");
+
+ return ("VNIC module");
+}
+
+/*
+ * rcm_mod_fini() - Destroy the network VNIC cache.
+ */
+int
+rcm_mod_fini(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_fini\n");
+
+ /*
+ * Note that vnic_unregister() does not seem to be called anywhere,
+ * therefore we free the cache nodes here. In theory we should call
+ * rcm_unregister_interest() for each node before we free it, but the
+ * framework does not provide the rcm_handle to allow us to do so.
+ */
+ cache_free();
+ (void) mutex_destroy(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_register() - Make sure the cache is properly sync'ed, and its
+ * registrations are in order.
+ */
+static int
+vnic_register(rcm_handle_t *hd)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: register\n");
+
+ if (cache_update(hd) < 0)
+ return (RCM_FAILURE);
+
+ /*
+ * Need to register interest in all new resources
+ * getting attached, so we get attach event notifications
+ */
+ if (!events_registered) {
+ if (rcm_register_event(hd, RCM_RESOURCE_LINK_NEW, 0, NULL)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to register %s\n"),
+ RCM_RESOURCE_LINK_NEW);
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: registered %s\n",
+ RCM_RESOURCE_LINK_NEW);
+ events_registered++;
+ }
+ }
+
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_unregister() - Walk the cache, unregistering all the networks.
+ */
+static int
+vnic_unregister(rcm_handle_t *hd)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: unregister\n");
+
+ /* Walk the cache, unregistering everything */
+ (void) mutex_lock(&cache_lock);
+ node = cache_head.vc_next;
+ while (node != &cache_tail) {
+ if (rcm_unregister_interest(hd, node->vc_resource, 0)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to unregister %s\n"),
+ node->vc_resource);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+ cache_remove(node);
+ node_free(node);
+ node = cache_head.vc_next;
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ /*
+ * Unregister interest in all new resources
+ */
+ if (events_registered) {
+ if (rcm_unregister_event(hd, RCM_RESOURCE_LINK_NEW, 0)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to unregister %s\n"),
+ RCM_RESOURCE_LINK_NEW);
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: unregistered %s\n",
+ RCM_RESOURCE_LINK_NEW);
+ events_registered--;
+ }
+ }
+
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_offline() - Offline VNICs on a specific node. 
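+ *
+ * In outline (a sketch of the flow implemented below): the RCM framework
+ * first makes a query pass, with RCM_QUERY set in flags, that only asks
+ * the consumers (e.g. IP interfaces) whether the offline is acceptable;
+ * the commit pass then actually deletes the VNICs:
+ *
+ *	vnic_consumer_offline()	- ask consumers to release the VNICs
+ *	flags & RCM_QUERY	- return after a successful query
+ *	vnic_offline_vnic()	- dladm_vnic_delete() each VNIC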
+ */
+static int
+vnic_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: offline(%s)\n", rsrc);
+
+ /* Lock the cache and lookup the resource */
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_REFRESH);
+ if (node == NULL) {
+ /* should not happen because the resource is registered. */
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "unrecognized resource");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+ }
+
+ /*
+ * Inform consumers (IP interfaces) of associated VNICs to be offlined
+ */
+ if (vnic_consumer_offline(hd, node, errorp, flags, info) ==
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: consumers agreed on offline\n");
+ } else {
+ vnic_log_err(node->vc_linkid, errorp,
+ "consumers failed to offline");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+
+ /* Check if it's a query */
+ if (flags & RCM_QUERY) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: offline query succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+ }
+
+ if (vnic_offline_vnic(node, VNIC_OFFLINED, CACHE_NODE_OFFLINED) !=
+ RCM_SUCCESS) {
+ vnic_online_vnic(node);
+ vnic_log_err(node->vc_linkid, errorp, "offline failed");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+
+ rcm_log_message(RCM_TRACE1, "VNIC: Offline succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_undo_offline() - Undo offline of a previously offlined node.
+ */
+/*ARGSUSED*/
+static int
+vnic_undo_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: online(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node == NULL) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp, "no such link");
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ /* If the link was never offlined by us, there is nothing to undo */
+ if (!(node->vc_state & CACHE_NODE_OFFLINED)) {
+ vnic_log_err(node->vc_linkid, errorp, "link not offlined");
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOTSUP;
+ return (RCM_SUCCESS);
+ }
+
+ vnic_online_vnic(node);
+
+ /*
+ * Inform IP interfaces on associated VNICs to be onlined
+ */
+ vnic_consumer_online(hd, node, errorp, flags, info);
+
+ node->vc_state &= ~CACHE_NODE_OFFLINED;
+ rcm_log_message(RCM_TRACE1, "VNIC: online succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+static void
+vnic_online_vnic(link_cache_t *node)
+{
+ dl_vnic_t *vnic;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+
+ /*
+ * Try to bring up all offlined VNICs
+ */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (!(vnic->dlv_flags & VNIC_OFFLINED))
+ continue;
+
+ if ((status = dladm_vnic_up(vnic->dlv_vnic_id, 0)) !=
+ DLADM_STATUS_OK) {
+ /*
+ * Print a warning message and continue to online
+ * other VNICs. 
+ */
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC online failed (%u): %s\n"),
+ vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ } else {
+ vnic->dlv_flags &= ~VNIC_OFFLINED;
+ }
+ }
+}
+
+static int
+vnic_offline_vnic(link_cache_t *node, uint32_t flags, cache_node_state_t state)
+{
+ dl_vnic_t *vnic;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_offline_vnic (%s %u %u)\n",
+ node->vc_resource, flags, state);
+
+ /*
+ * Try to delete all explicitly created VNICs
+ */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+
+ if ((status = dladm_vnic_delete(vnic->dlv_vnic_id,
+ DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC offline failed (%u): %s\n"),
+ vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: VNIC offline succeeded(%u)\n",
+ vnic->dlv_vnic_id);
+ vnic->dlv_flags |= flags;
+ }
+ }
+
+ node->vc_state |= state;
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_get_info() - Gather usage information for this resource.
+ */
+/*ARGSUSED*/
+int
+vnic_get_info(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **usagep, char **errorp, nvlist_t *props, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: get_info(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_REFRESH);
+ if (node == NULL) {
+ rcm_log_message(RCM_INFO,
+ _("VNIC: get_info(%s) unrecognized resource\n"), rsrc);
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ *usagep = vnic_usage(node);
+ (void) mutex_unlock(&cache_lock);
+ if (*usagep == NULL) {
+ /* most likely malloc failure */
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: get_info(%s) malloc failure\n"), rsrc);
+ errno = ENOMEM;
+ return (RCM_FAILURE);
+ }
+
+ /* Set client/role properties */
+ (void) nvlist_add_string(props, RCM_CLIENT_NAME, "VNIC");
+
+ rcm_log_message(RCM_TRACE1, "VNIC: get_info(%s) info = %s\n",
+ rsrc, *usagep);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_suspend() - Nothing to do, always okay
+ */
+/*ARGSUSED*/
+static int
+vnic_suspend(rcm_handle_t *hd, char *rsrc, id_t id, timespec_t *interval,
+ uint_t flags, char **errorp, rcm_info_t **info)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: suspend(%s)\n", rsrc);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_resume() - Nothing to do, always okay
+ */
+/*ARGSUSED*/
+static int
+vnic_resume(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: resume(%s)\n", rsrc);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_consumer_remove()
+ *
+ * Notify VNIC consumers to remove their cached state.
+ */
+static int
+vnic_consumer_remove(rcm_handle_t *hd, link_cache_t *node, uint_t flags,
+ rcm_info_t **info)
+{
+ dl_vnic_t *vnic = NULL;
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ int ret = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_remove (%s)\n",
+ node->vc_resource);
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+
+ /*
+ * This will only be called when the offline operation
+ * succeeds, so the VNIC consumers must have been offlined
+ * at this point. 
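+ *
+ * Each consumer is addressed by the RCM resource name
+ * "SUNW_datalink/<vnic-linkid>" built from RCM_LINK_PREFIX (the
+ * linkid value is illustrative, e.g. "SUNW_datalink/5"); the first
+ * rcm_notify_remove() failure aborts the walk.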
+ */
+ assert(vnic->dlv_flags & VNIC_CONSUMER_OFFLINED);
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic->dlv_vnic_id);
+
+ ret = rcm_notify_remove(hd, rsrc, flags, info);
+ if (ret != RCM_SUCCESS) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: notify remove failed (%s)\n"), rsrc);
+ break;
+ }
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_remove done\n");
+ return (ret);
+}
+
+/*
+ * vnic_remove() - remove a resource from cache
+ */
+/*ARGSUSED*/
+static int
+vnic_remove(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+ int rv;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: remove(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node == NULL) {
+ rcm_log_message(RCM_INFO,
+ _("VNIC: remove(%s) unrecognized resource\n"), rsrc);
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ /* remove the cached entry for the resource */
+ cache_remove(node);
+ (void) mutex_unlock(&cache_lock);
+
+ rv = vnic_consumer_remove(hd, node, flags, info);
+ node_free(node);
+ return (rv);
+}
+
+/*
+ * vnic_notify_event - Project-private implementation to receive new resource
+ * events. It intercepts all new resource events. If the
+ * new resource is a network resource, pass up a notify
+ * for it too. The new resource need not be cached here,
+ * since caching is redone at register time.
+ */
+/*ARGSUSED*/
+static int
+vnic_notify_event(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, nvlist_t *nvl, rcm_info_t **info)
+{
+ nvpair_t *nvp = NULL;
+ datalink_id_t linkid;
+ uint64_t id64;
+ int rv = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: notify_event(%s)\n", rsrc);
+
+ if (strcmp(rsrc, RCM_RESOURCE_LINK_NEW) != 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "unrecognized event");
+ errno = EINVAL;
+ return (RCM_FAILURE);
+ }
+
+ /* Update cache to reflect latest VNICs */
+ if (cache_update(hd) < 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "private cache update failed");
+ return (RCM_FAILURE);
+ }
+
+ /*
+ * Try our best to recover the configuration.
+ */
+ rcm_log_message(RCM_DEBUG, "VNIC: process_nvlist\n");
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ if (strcmp(nvpair_name(nvp), RCM_NV_LINKID) != 0)
+ continue;
+
+ if (nvpair_value_uint64(nvp, &id64) != 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "cannot get linkid");
+ rv = RCM_FAILURE;
+ continue;
+ }
+
+ linkid = (datalink_id_t)id64;
+ if (vnic_configure(hd, linkid) != 0) {
+ vnic_log_err(linkid, errorp, "configuring failed");
+ rv = RCM_FAILURE;
+ continue;
+ }
+
+ /* Notify all VNIC consumers */
+ if (vnic_consumer_notify(hd, linkid, errorp, flags,
+ info) != 0) {
+ vnic_log_err(linkid, errorp, "consumer notify failed");
+ rv = RCM_FAILURE;
+ }
+ }
+
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: notify_event: link configuration complete\n");
+ return (rv);
+}
+
+/*
+ * vnic_usage - Determine the usage of a link.
+ * The returned buffer is owned by the caller, who
+ * must free it when done. 
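+ *
+ * For example (illustrative link names): a link net0 carrying two
+ * VNICs produces a string like "net0 VNICs: vnic1, vnic2", while an
+ * offlined node produces "net0 offlined".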
+ */
+static char *
+vnic_usage(link_cache_t *node)
+{
+ dl_vnic_t *vnic;
+ int nvnic;
+ char *buf;
+ const char *fmt;
+ char *sep;
+ char errmsg[DLADM_STRSIZE];
+ char name[MAXLINKNAMELEN];
+ dladm_status_t status;
+ size_t bufsz;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: usage(%s)\n", node->vc_resource);
+
+ assert(MUTEX_HELD(&cache_lock));
+ if ((status = dladm_datalink_id2info(node->vc_linkid, NULL, NULL, NULL,
+ name, sizeof (name))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) get link name failure(%s)\n"),
+ node->vc_resource, dladm_status2str(status, errmsg));
+ return (NULL);
+ }
+
+ if (node->vc_state & CACHE_NODE_OFFLINED)
+ fmt = _("%1$s offlined");
+ else
+ fmt = _("%1$s VNICs: ");
+
+ /* TRANSLATION_NOTE: separator used between VNIC linkids */
+ sep = _(", ");
+
+ nvnic = 0;
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next)
+ nvnic++;
+
+ /* space for VNICs and separators, plus message */
+ bufsz = nvnic * (MAXLINKNAMELEN + strlen(sep)) +
+ strlen(fmt) + MAXLINKNAMELEN + 1;
+ if ((buf = malloc(bufsz)) == NULL) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) malloc failure(%s)\n"),
+ node->vc_resource, strerror(errno));
+ return (NULL);
+ }
+ (void) snprintf(buf, bufsz, fmt, name);
+
+ if (node->vc_state & CACHE_NODE_OFFLINED) {
+ /* Nothing else to do */
+ rcm_log_message(RCM_TRACE2, "VNIC: usage (%s) info = %s\n",
+ node->vc_resource, buf);
+ return (buf);
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ rcm_log_message(RCM_DEBUG, "VNIC:= %u\n", vnic->dlv_vnic_id);
+
+ if ((status = dladm_datalink_id2info(vnic->dlv_vnic_id, NULL,
+ NULL, NULL, name, sizeof (name))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) get vnic %u name failure(%s)\n"),
+ node->vc_resource, vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ free(buf);
+ return (NULL);
+ }
+
+ (void) strlcat(buf, name, bufsz);
+ if (vnic->dlv_next != NULL)
+ (void) strlcat(buf, sep, bufsz);
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: usage (%s) info = %s\n",
+ node->vc_resource, buf);
+
+ return (buf);
+}
+
+/*
+ * Cache management routines. All cache management functions must be
+ * called with cache_lock held.
+ */
+
+/*
+ * cache_lookup() - Get a cache node for a resource.
+ * Call with cache lock held.
+ *
+ * This ensures that the cache is consistent with the system state and
+ * returns a pointer to the cache element corresponding to the resource. 
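+ *
+ * Note that CACHE_REFRESH temporarily drops cache_lock so that
+ * cache_update() can acquire it again; callers must not assume the
+ * cache is unchanged across this call.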
+ */
+static link_cache_t *
+cache_lookup(rcm_handle_t *hd, char *rsrc, char options)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache lookup(%s)\n", rsrc);
+
+ assert(MUTEX_HELD(&cache_lock));
+ if (options & CACHE_REFRESH) {
+ /* drop lock since update locks cache again */
+ (void) mutex_unlock(&cache_lock);
+ (void) cache_update(hd);
+ (void) mutex_lock(&cache_lock);
+ }
+
+ node = cache_head.vc_next;
+ for (; node != &cache_tail; node = node->vc_next) {
+ if (strcmp(rsrc, node->vc_resource) == 0) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: cache lookup succeeded(%s)\n", rsrc);
+ return (node);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * node_free - Free a node from the cache
+ */
+static void
+node_free(link_cache_t *node)
+{
+ dl_vnic_t *vnic, *next;
+
+ if (node != NULL) {
+ free(node->vc_resource);
+
+ /* free the VNIC list */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = next) {
+ next = vnic->dlv_next;
+ free(vnic);
+ }
+ free(node);
+ }
+}
+
+/*
+ * cache_insert - Insert a resource node in cache
+ */
+static void
+cache_insert(link_cache_t *node)
+{
+ assert(MUTEX_HELD(&cache_lock));
+
+ /* insert at the head for best performance */
+ node->vc_next = cache_head.vc_next;
+ node->vc_prev = &cache_head;
+
+ node->vc_next->vc_prev = node;
+ node->vc_prev->vc_next = node;
+}
+
+/*
+ * cache_remove() - Remove a resource node from cache.
+ */
+static void
+cache_remove(link_cache_t *node)
+{
+ assert(MUTEX_HELD(&cache_lock));
+ node->vc_next->vc_prev = node->vc_prev;
+ node->vc_prev->vc_next = node->vc_next;
+ node->vc_next = NULL;
+ node->vc_prev = NULL;
+}
+
+typedef struct vnic_update_arg_s {
+ rcm_handle_t *hd;
+ int retval;
+} vnic_update_arg_t;
+
+/*
+ * vnic_update() - Update the cache entry for the given VNIC
+ */
+static int
+vnic_update(datalink_id_t vnicid, void *arg)
+{
+ vnic_update_arg_t *vnic_update_argp = arg;
+ rcm_handle_t *hd = vnic_update_argp->hd;
+ link_cache_t *node;
+ dl_vnic_t *vnic;
+ char *rsrc;
+ dladm_vnic_attr_t vnic_attr;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+ boolean_t newnode = B_FALSE;
+ int ret = -1;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_update(%u)\n", vnicid);
+
+ assert(MUTEX_HELD(&cache_lock));
+ status = dladm_vnic_info(vnicid, &vnic_attr, DLADM_OPT_ACTIVE);
+ if (status != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_update() cannot get vnic information for "
+ "%u(%s)\n", vnicid, dladm_status2str(status, errmsg));
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ if (vnic_attr.va_link_id == DATALINK_INVALID_LINKID) {
+ /*
+ * Skip the etherstubs. 
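+ * An etherstub is a VNIC-class datalink with no underlying link
+ * (its va_link_id is DATALINK_INVALID_LINKID), so there is no
+ * physical resource whose removal would need to be tracked.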
+ */
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_update(): skip the etherstub %u\n", vnicid);
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ rsrc = malloc(RCM_LINK_RESOURCE_MAX);
+ if (rsrc == NULL) {
+ rcm_log_message(RCM_ERROR, _("VNIC: malloc error(%s): %u\n"),
+ strerror(errno), vnicid);
+ goto done;
+ }
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic_attr.va_link_id);
+
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node != NULL) {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: %s already registered (vnicid:%d)\n",
+ rsrc, vnic_attr.va_vnic_id);
+ free(rsrc);
+ } else {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: %s is a new resource (vnicid:%d)\n",
+ rsrc, vnic_attr.va_vnic_id);
+ if ((node = calloc(1, sizeof (link_cache_t))) == NULL) {
+ free(rsrc);
+ rcm_log_message(RCM_ERROR, _("VNIC: calloc: %s\n"),
+ strerror(errno));
+ goto done;
+ }
+
+ node->vc_resource = rsrc;
+ node->vc_vnic = NULL;
+ node->vc_linkid = vnic_attr.va_link_id;
+ node->vc_state |= CACHE_NODE_NEW;
+ newnode = B_TRUE;
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (vnic->dlv_vnic_id == vnicid) {
+ vnic->dlv_flags &= ~VNIC_STALE;
+ break;
+ }
+ }
+
+ if (vnic == NULL) {
+ if ((vnic = calloc(1, sizeof (dl_vnic_t))) == NULL) {
+ rcm_log_message(RCM_ERROR, _("VNIC: calloc: %s\n"),
+ strerror(errno));
+ if (newnode) {
+ free(rsrc);
+ free(node);
+ }
+ goto done;
+ }
+ vnic->dlv_vnic_id = vnicid;
+ vnic->dlv_next = node->vc_vnic;
+ vnic->dlv_prev = NULL;
+ if (node->vc_vnic != NULL)
+ node->vc_vnic->dlv_prev = vnic;
+ node->vc_vnic = vnic;
+ }
+
+ node->vc_state &= ~CACHE_NODE_STALE;
+
+ if (newnode)
+ cache_insert(node);
+
+ rcm_log_message(RCM_TRACE3, "VNIC: vnic_update: succeeded(%u)\n",
+ vnicid);
+ ret = 0;
+done:
+ vnic_update_argp->retval = ret;
+ return (ret == 0 ? DLADM_WALK_CONTINUE : DLADM_WALK_TERMINATE);
+}
+
+/*
+ * vnic_update_all() - Determine all VNIC links in the system
+ */
+static int
+vnic_update_all(rcm_handle_t *hd)
+{
+ vnic_update_arg_t arg = {NULL, 0};
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_update_all\n");
+
+ assert(MUTEX_HELD(&cache_lock));
+ arg.hd = hd;
+ (void) dladm_walk_datalink_id(vnic_update, &arg, DATALINK_CLASS_VNIC,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+ return (arg.retval);
+}
+
+/*
+ * cache_update() - Update cache with latest interface info
+ */
+static int
+cache_update(rcm_handle_t *hd)
+{
+ link_cache_t *node, *nnode;
+ dl_vnic_t *vnic;
+ int rv;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache_update\n");
+
+ (void) mutex_lock(&cache_lock);
+
+ /* first we walk the entire cache, marking each entry stale */
+ node = cache_head.vc_next;
+ for (; node != &cache_tail; node = node->vc_next) {
+ node->vc_state |= CACHE_NODE_STALE;
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next)
+ vnic->dlv_flags |= VNIC_STALE;
+ }
+
+ rv = vnic_update_all(hd);
+
+ /*
+ * Continue to delete all stale nodes from the cache even if
+ * vnic_update_all() failed. 
Unregister links that are not offlined
+ * and are still in the cache.
+ */
+ for (node = cache_head.vc_next; node != &cache_tail; node = nnode) {
+ dl_vnic_t *vnic, *next;
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = next) {
+ next = vnic->dlv_next;
+
+ /* clear stale VNICs */
+ if (vnic->dlv_flags & VNIC_STALE) {
+ if (vnic->dlv_prev != NULL)
+ vnic->dlv_prev->dlv_next = next;
+ else
+ node->vc_vnic = next;
+
+ if (next != NULL)
+ next->dlv_prev = vnic->dlv_prev;
+ free(vnic);
+ }
+ }
+
+ nnode = node->vc_next;
+ if (node->vc_state & CACHE_NODE_STALE) {
+ (void) rcm_unregister_interest(hd, node->vc_resource,
+ 0);
+ rcm_log_message(RCM_DEBUG, "VNIC: unregistered %s\n",
+ node->vc_resource);
+ assert(node->vc_vnic == NULL);
+ cache_remove(node);
+ node_free(node);
+ continue;
+ }
+
+ if (!(node->vc_state & CACHE_NODE_NEW))
+ continue;
+
+ if (rcm_register_interest(hd, node->vc_resource, 0, NULL) !=
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to register %s\n"),
+ node->vc_resource);
+ rv = -1;
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: registered %s\n",
+ node->vc_resource);
+ node->vc_state &= ~CACHE_NODE_NEW;
+ }
+ }
+
+ (void) mutex_unlock(&cache_lock);
+ return (rv);
+}
+
+/*
+ * cache_free() - Empty the cache
+ */
+static void
+cache_free()
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache_free\n");
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_head.vc_next;
+ while (node != &cache_tail) {
+ cache_remove(node);
+ node_free(node);
+ node = cache_head.vc_next;
+ }
+ (void) mutex_unlock(&cache_lock);
+}
+
+/*
+ * vnic_log_err() - RCM error log wrapper
+ */
+static void
+vnic_log_err(datalink_id_t linkid, char **errorp, char *errmsg)
+{
+ char link[MAXLINKNAMELEN];
+ char errstr[DLADM_STRSIZE];
+ dladm_status_t status;
+ int len;
+ const char *errfmt;
+ char *error;
+
+ link[0] = '\0';
+ if (linkid != DATALINK_INVALID_LINKID) {
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+
+ (void) snprintf(rsrc, sizeof (rsrc), "%s/%u",
+ RCM_LINK_PREFIX, linkid);
+
+ rcm_log_message(RCM_ERROR, _("VNIC: %s(%s)\n"), errmsg, rsrc);
+ if ((status = dladm_datalink_id2info(linkid, NULL, NULL,
+ NULL, link, sizeof (link))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: cannot get link name for (%s) %s\n"),
+ rsrc, dladm_status2str(status, errstr));
+ }
+ } else {
+ rcm_log_message(RCM_ERROR, _("VNIC: %s\n"), errmsg);
+ }
+
+ errfmt = strlen(link) > 0 ? _("VNIC: %s(%s)") : _("VNIC: %s");
+ len = strlen(errfmt) + strlen(errmsg) + MAXLINKNAMELEN + 1;
+ if ((error = malloc(len)) != NULL) {
+ if (strlen(link) > 0)
+ (void) snprintf(error, len, errfmt, errmsg, link);
+ else
+ (void) snprintf(error, len, errfmt, errmsg);
+ }
+
+ if (errorp != NULL)
+ *errorp = error;
+}
+
+/*
+ * vnic_consumer_online()
+ *
+ * Notify VNIC consumers that their VNIC links are back online. 
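+ *
+ * Only VNICs marked VNIC_CONSUMER_OFFLINED are notified, and the flag
+ * is cleared for each consumer that acknowledges the online.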
+ */ +/* ARGSUSED */ +static void +vnic_consumer_online(rcm_handle_t *hd, link_cache_t *node, char **errorp, + uint_t flags, rcm_info_t **info) +{ + dl_vnic_t *vnic; + char rsrc[RCM_LINK_RESOURCE_MAX]; + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_online (%s)\n", + node->vc_resource); + + for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) { + if (!(vnic->dlv_flags & VNIC_CONSUMER_OFFLINED)) + continue; + + (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", + RCM_LINK_PREFIX, vnic->dlv_vnic_id); + + if (rcm_notify_online(hd, rsrc, flags, info) == RCM_SUCCESS) + vnic->dlv_flags &= ~VNIC_CONSUMER_OFFLINED; + } + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_online done\n"); +} + +/* + * vnic_consumer_offline() + * + * Offline VNIC consumers. + */ +static int +vnic_consumer_offline(rcm_handle_t *hd, link_cache_t *node, char **errorp, + uint_t flags, rcm_info_t **info) +{ + dl_vnic_t *vnic; + char rsrc[RCM_LINK_RESOURCE_MAX]; + int ret = RCM_SUCCESS; + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_offline (%s)\n", + node->vc_resource); + + for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) { + (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", + RCM_LINK_PREFIX, vnic->dlv_vnic_id); + + ret = rcm_request_offline(hd, rsrc, flags, info); + if (ret != RCM_SUCCESS) + break; + + vnic->dlv_flags |= VNIC_CONSUMER_OFFLINED; + } + + if (vnic != NULL) + vnic_consumer_online(hd, node, errorp, flags, info); + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_offline done\n"); + return (ret); +} + +/* + * Send RCM_RESOURCE_LINK_NEW events to other modules about new VNICs. + * Return 0 on success, -1 on failure. + */ +static int +vnic_notify_new_vnic(rcm_handle_t *hd, char *rsrc) +{ + link_cache_t *node; + dl_vnic_t *vnic; + nvlist_t *nvl = NULL; + uint64_t id; + int ret = -1; + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_notify_new_vnic (%s)\n", rsrc); + + (void) mutex_lock(&cache_lock); + if ((node = cache_lookup(hd, rsrc, CACHE_REFRESH)) == NULL) { + (void) mutex_unlock(&cache_lock); + return (0); + } + + if (nvlist_alloc(&nvl, 0, 0) != 0) { + (void) mutex_unlock(&cache_lock); + rcm_log_message(RCM_WARNING, + _("VNIC: failed to allocate nvlist\n")); + goto done; + } + + for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) { + rcm_log_message(RCM_TRACE2, + "VNIC: vnic_notify_new_vnic add (%u)\n", vnic->dlv_vnic_id); + + id = vnic->dlv_vnic_id; + if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) { + rcm_log_message(RCM_ERROR, + _("VNIC: failed to construct nvlist\n")); + (void) mutex_unlock(&cache_lock); + goto done; + } + } + (void) mutex_unlock(&cache_lock); + + if (rcm_notify_event(hd, RCM_RESOURCE_LINK_NEW, 0, nvl, NULL) != + RCM_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("VNIC: failed to notify %s event for %s\n"), + RCM_RESOURCE_LINK_NEW, node->vc_resource); + goto done; + } + + ret = 0; +done: + if (nvl != NULL) + nvlist_free(nvl); + return (ret); +} + +/* + * vnic_consumer_notify() - Notify consumers of VNICs coming back online. + */ +static int +vnic_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp, + uint_t flags, rcm_info_t **info) +{ + char rsrc[RCM_LINK_RESOURCE_MAX]; + link_cache_t *node; + + /* Check for the interface in the cache */ + (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", RCM_LINK_PREFIX, + linkid); + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_notify(%s)\n", rsrc); + + /* + * Inform IP consumers of the new link. 
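+ * If the notification fails, the VNICs on this link are marked stale
+ * and torn down via vnic_offline_vnic(VNIC_STALE, CACHE_NODE_STALE) so
+ * that a later attach can retry from a clean state.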
+ */
+ if (vnic_notify_new_vnic(hd, rsrc) != 0) {
+ (void) mutex_lock(&cache_lock);
+ if ((node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH)) != NULL) {
+ (void) vnic_offline_vnic(node, VNIC_STALE,
+ CACHE_NODE_STALE);
+ }
+ (void) mutex_unlock(&cache_lock);
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_notify_new_vnic failed(%s)\n", rsrc);
+ return (-1);
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_notify succeeded\n");
+ return (0);
+}
+
+typedef struct vnic_up_arg_s {
+ datalink_id_t linkid;
+ int retval;
+} vnic_up_arg_t;
+
+static int
+vnic_up(datalink_id_t vnicid, void *arg)
+{
+ vnic_up_arg_t *vnic_up_argp = arg;
+ dladm_status_t status;
+ dladm_vnic_attr_t vnic_attr;
+ char errmsg[DLADM_STRSIZE];
+
+ status = dladm_vnic_info(vnicid, &vnic_attr, DLADM_OPT_PERSIST);
+ if (status != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_up(): cannot get information for VNIC %u "
+ "(%s)\n", vnicid, dladm_status2str(status, errmsg));
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ if (vnic_attr.va_link_id != vnic_up_argp->linkid)
+ return (DLADM_WALK_CONTINUE);
+
+ rcm_log_message(RCM_TRACE3, "VNIC: vnic_up(%u)\n", vnicid);
+ if ((status = dladm_vnic_up(vnicid, 0)) == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+
+ /*
+ * Print a warning message and continue to bring up the other VNICs.
+ */
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC up failed (%u): %s\n"),
+ vnicid, dladm_status2str(status, errmsg));
+
+ vnic_up_argp->retval = -1;
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * vnic_configure() - Configure VNICs over a physical link after it attaches
+ */
+static int
+vnic_configure(rcm_handle_t *hd, datalink_id_t linkid)
+{
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ link_cache_t *node;
+ vnic_up_arg_t arg = {DATALINK_INVALID_LINKID, 0};
+
+ /* Check for the VNICs in the cache */
+ (void) snprintf(rsrc, sizeof (rsrc), "%s/%u", RCM_LINK_PREFIX, linkid);
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_configure(%s)\n", rsrc);
+
+ /* Check if the link is new or was previously offlined */
+ (void) mutex_lock(&cache_lock);
+ if (((node = cache_lookup(hd, rsrc, CACHE_REFRESH)) != NULL) &&
+ (!(node->vc_state & CACHE_NODE_OFFLINED))) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: Skipping configured interface(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (0);
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ arg.linkid = linkid;
+ (void) dladm_walk_datalink_id(vnic_up, &arg, DATALINK_CLASS_VNIC,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST);
+
+ if (arg.retval == 0) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_configure succeeded(%s)\n", rsrc);
+ }
+ return (arg.retval);
+}
diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical
index bcee0c9818..8530806768 100644
--- a/usr/src/cmd/svc/milestone/net-physical
+++ b/usr/src/cmd/svc/milestone/net-physical
@@ -26,8 +26,6 @@
 # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
 # All rights reserved.
 #
-#
-# ident "%Z%%M% %I% %E% SMI"
 . /lib/svc/share/smf_include.sh
 . /lib/svc/share/net_include.sh
@@ -81,6 +79,14 @@ if smf_is_globalzone; then
 /sbin/dladm up-aggr
 /sbin/dladm up-vlan
 /sbin/dladm init-secobj
+ #
+ # Bring up VNICs
+ #
+ /sbin/dladm up-vnic
+ #
+ # Create flows via flowadm. 
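+ #
+ # init-flow re-creates flows recorded in the persistent flow
+ # configuration. For example (illustrative link and flow names),
+ # a flow created earlier with
+ #     flowadm add-flow -l net0 -a transport=tcp,local_port=80 \
+ #         -p maxbw=100M http-flow
+ # is re-instantiated here at boot.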
+ # + /sbin/flowadm init-flow fi # diff --git a/usr/src/cmd/svc/profile/generic_limited_net.xml b/usr/src/cmd/svc/profile/generic_limited_net.xml index 449d06bf1e..5fed0e86bf 100644 --- a/usr/src/cmd/svc/profile/generic_limited_net.xml +++ b/usr/src/cmd/svc/profile/generic_limited_net.xml @@ -62,6 +62,7 @@ <instance name='flow' enabled='false'/> <instance name='process' enabled='false'/> <instance name='task' enabled='false'/> + <instance name='net' enabled='false'/> </service> <service name='system/hal' version='1' type='service'> <instance name='default' enabled='true'/> diff --git a/usr/src/cmd/svc/profile/generic_open.xml b/usr/src/cmd/svc/profile/generic_open.xml index 7d837f4b53..34b600cca1 100644 --- a/usr/src/cmd/svc/profile/generic_open.xml +++ b/usr/src/cmd/svc/profile/generic_open.xml @@ -59,6 +59,7 @@ <instance name='flow' enabled='false'/> <instance name='process' enabled='false'/> <instance name='task' enabled='false'/> + <instance name='net' enabled='false'/> </service> <service name='system/hal' version='1' type='service'> <instance name='default' enabled='true'/> diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 3869b370c1..46b2b5a958 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -84,6 +84,7 @@ #include <sys/ptms.h> #include <sys/aggr.h> #include <sys/dld.h> +#include <sys/vnic.h> #include <sys/fs/zfs.h> #include <inet/kssl/kssl.h> #include <sys/dkio.h> @@ -844,18 +845,38 @@ const struct ioc { { (uint_t)DLDIOC_ATTR, "DLDIOC_ATTR", "dld_ioc_attr"}, { (uint_t)DLDIOC_PHYS_ATTR, "DLDIOC_PHYS_ATTR", "dld_ioc_phys_attr"}, - { (uint_t)DLDIOC_VLAN_ATTR, "DLDIOC_VLAN_ATTR", - "dld_ioc_vlan_attr"}, - { (uint_t)DLDIOC_CREATE_VLAN, "DLDIOC_CREATE_VLAN", - "dld_ioc_create_vlan"}, - { (uint_t)DLDIOC_DELETE_VLAN, "DLDIOC_DELETE_VLAN", - "dld_ioc_delete_vlan"}, - { (uint_t)DLDIOC_DOORSERVER, "DLDIOC_DOORSERVER", "dld_ioc_door"}, - { (uint_t)DLDIOC_RENAME, "DLDIOC_RENAME", "dld_ioc_rename"}, - { (uint_t)DLDIOC_SETMACPROP, "DLDIOC_SETMACPROP", + { (uint_t)DLDIOC_DOORSERVER, "DLDIOC_DOORSERVER", "dld_ioc_door"}, + { (uint_t)DLDIOC_RENAME, "DLDIOC_RENAME", "dld_ioc_rename"}, + { (uint_t)DLDIOC_SECOBJ_GET, "DLDIOC_SECOBJ_GET", + "dld_ioc_secobj_get"}, + { (uint_t)DLDIOC_SECOBJ_SET, "DLDIOC_SECOBJ_SET", + "dld_ioc_secobj_set"}, + { (uint_t)DLDIOC_SECOBJ_UNSET, "DLDIOC_SECOBJ_UNSET", + "dld_ioc_secobj_unset"}, + { (uint_t)DLDIOC_MACADDRGET, "DLDIOC_MACADDRGET", + "dld_ioc_macaddrget"}, + { (uint_t)DLDIOC_SETMACPROP, "DLDIOC_SETMACPROP", "dld_ioc_macprop_s"}, - { (uint_t)DLDIOC_GETMACPROP, "DLDIOC_GETMACPROP", + { (uint_t)DLDIOC_GETMACPROP, "DLDIOC_GETMACPROP", "dld_ioc_macprop_s"}, + { (uint_t)DLDIOC_ADDFLOW, "DLDIOC_ADDFLOW", + "dld_ioc_addflow"}, + { (uint_t)DLDIOC_REMOVEFLOW, "DLDIOC_REMOVEFLOW", + "dld_ioc_removeflow"}, + { (uint_t)DLDIOC_MODIFYFLOW, "DLDIOC_MODIFYFLOW", + "dld_ioc_modifyflow"}, + { (uint_t)DLDIOC_WALKFLOW, "DLDIOC_WALKFLOW", + "dld_ioc_walkflow"}, + { (uint_t)DLDIOC_USAGELOG, "DLDIOC_USAGELOG", + "dld_ioc_usagelog"}, + + /* vnic ioctls */ + { (uint_t)VNIC_IOC_CREATE, "VNIC_IOC_CREATE", + "vnic_ioc_create"}, + { (uint_t)VNIC_IOC_DELETE, "VNIC_IOC_DELETE", + "vnic_ioc_delete"}, + { (uint_t)VNIC_IOC_INFO, "VNIC_IOC_INFO", + "vnic_ioc_info"}, /* ZFS ioctls */ { (uint_t)ZFS_IOC_POOL_CREATE, "ZFS_IOC_POOL_CREATE", diff --git a/usr/src/cmd/vna/Makefile b/usr/src/cmd/vna/Makefile index 4e5e25e85b..6b608e0126 100644 --- a/usr/src/cmd/vna/Makefile +++ b/usr/src/cmd/vna/Makefile @@ -22,15 +22,16 @@ # Copyright 2008 Sun Microsystems, 
Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident	"%Z%%M%	%I%	%E% SMI"
#

PROG =		vna

include ../Makefile.cmd

+LDLIBS += -L$(ROOT)/lib
LDLIBS += -ldladm -lsocket -ldlpi
+
.KEEP_STATE:

all: $(PROG)
diff --git a/usr/src/cmd/vna/vna.c b/usr/src/cmd/vna/vna.c
index 6262de5959..6a05cf1777 100644
--- a/usr/src/cmd/vna/vna.c
+++ b/usr/src/cmd/vna/vna.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
/*
 * This utility constitutes a private interface - it will be removed
 * in a future release of Solaris. Neither users nor other software
@@ -40,7 +38,7 @@
#include <libdlpi.h>

typedef struct vnic_attr {
-	dladm_vnic_attr_sys_t	attr;
+	dladm_vnic_attr_t	attr;
	char	*name;
} vnic_attr_t;

@@ -48,7 +46,7 @@ typedef struct vnic_attr {
static int
v_print(datalink_id_t vnic_id, void *arg)
{
-	dladm_vnic_attr_sys_t	attr;
+	dladm_vnic_attr_t	attr;
	char vnic[MAXLINKNAMELEN];
	char link[MAXLINKNAMELEN];

@@ -87,8 +85,8 @@ static int
v_find(datalink_id_t vnic_id, void *arg)
{
	vnic_attr_t	*vattr = arg;
-	dladm_vnic_attr_sys_t	*specp = &vattr->attr;
-	dladm_vnic_attr_sys_t	attr;
+	dladm_vnic_attr_t	*specp = &vattr->attr;
+	dladm_vnic_attr_t	attr;
	char	linkname[MAXLINKNAMELEN];

	if (dladm_vnic_info(vnic_id, &attr, DLADM_OPT_ACTIVE) !=
@@ -221,7 +219,8 @@ v_add(char *link, char *addr, char *name)
	 */
	status = dladm_vnic_create(name, linkid,
	    VNIC_MAC_ADDR_TYPE_FIXED, (uchar_t *)ea->ether_addr_octet,
-	    ETHERADDRL, &vnic_id, DLADM_OPT_ACTIVE);
+	    ETHERADDRL, NULL, 0, 0, &vnic_id, NULL, DLADM_OPT_ACTIVE);
+
	if (status != DLADM_STATUS_OK) {
		(void) fprintf(stderr, "dladm_vnic_create: %s\n",
		    dladm_status2str(status, buf));
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index b7a9d26795..8b3fb0aaf9 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -546,7 +546,7 @@ libdevinfo:	libnvpair libsec
libdhcpagent:	libsocket libdhcputil libuuid libdlpi
libdhcpsvc:	libinetutil libdhcputil
libdhcputil:	libnsl libgen libinetutil libdlpi
-libdladm:	libdevinfo libinetutil libsocket
+libdladm:	libdevinfo libinetutil libsocket libnsl libexacct libscf
libdll:		libast
libdlpi:	libinetutil libdladm
libdscfg:	libnsctl libunistat libsocket libnsl
diff --git a/usr/src/lib/libdladm/Makefile b/usr/src/lib/libdladm/Makefile
index 630a7e2e19..ebe6c51eee 100644
--- a/usr/src/lib/libdladm/Makefile
+++ b/usr/src/lib/libdladm/Makefile
@@ -22,14 +22,14 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident	"%Z%%M%	%I%	%E% SMI"
#

include $(SRC)/lib/Makefile.lib

HDRS =		libdladm.h libdladm_impl.h libdllink.h libdlaggr.h \
		libdlwlan.h libdlwlan_impl.h libdlvnic.h libdlvlan.h \
-		libdlmgmt.h
+		libdlmgmt.h libdlflow.h libdlflow_impl.h libdlstat.h
+
HDRDIR =	common
SUBDIRS =	$(MACH)
@@ -39,7 +39,11 @@ POFILE =	libdladm.po
MSGFILES =	common/libdladm.c common/linkprop.c common/secobj.c	\
		common/libdllink.c common/libdlaggr.c	\
		common/libdlwlan.c common/libdlvnic.c	\
-		common/libdlvlan.c common/libdlmgmt.c
+		common/libdlvlan.c common/libdlmgmt.c	\
+		common/flowattr.c common/flowprop.c	\
+		common/propfuncs.c common/libdlflow.c	\
+		common/libdlstat.c
+
XGETFLAGS =	-a -x libdladm.xcl

all :=		TARGET = all
diff --git a/usr/src/lib/libdladm/Makefile.com b/usr/src/lib/libdladm/Makefile.com
index 0f6419bd29..50aa57e710 100644
--- a/usr/src/lib/libdladm/Makefile.com
+++ b/usr/src/lib/libdladm/Makefile.com
@@ -22,13 +22,13 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# -# ident "%Z%%M% %I% %E% SMI" -# LIBRARY = libdladm.a VERS = .1 OBJECTS = libdladm.o secobj.o linkprop.o libdllink.o libdlaggr.o \ - libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o + libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o \ + flowattr.o flowprop.o propfuncs.o libdlflow.o libdlstat.o \ + usage.o include ../../Makefile.lib @@ -36,8 +36,8 @@ include ../../Makefile.lib include ../../Makefile.rootfs LIBS = $(DYNLIB) $(LINTLIB) -LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm \ - -lnvpair -lkstat +LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm -lnvpair \ + -lexacct -lnsl -lkstat -lcurses SRCDIR = ../common $(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) diff --git a/usr/src/lib/libdladm/common/flowattr.c b/usr/src/lib/libdladm/common/flowattr.c new file mode 100644 index 0000000000..4fb578e5bc --- /dev/null +++ b/usr/src/lib/libdladm/common/flowattr.c @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <errno.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/mac_flow.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <net/if_types.h> +#include <net/if_dl.h> +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <libdladm.h> +#include <libdlflow.h> +#include <libdlflow_impl.h> + +#define V4_PART_OF_V6(v6) ((v6)._S6_un._S6_u32[3]) + +/* max port number for UDP, TCP & SCTP */ +#define MAX_PORT 65535 + +static fad_checkf_t do_check_local_ip; +static fad_checkf_t do_check_remote_ip; +static fad_checkf_t do_check_protocol; +static fad_checkf_t do_check_local_port; + +static dladm_status_t do_check_port(char *, boolean_t, flow_desc_t *); + +static fattr_desc_t attr_table[] = { + { "local_ip", do_check_local_ip }, + { "remote_ip", do_check_remote_ip }, + { "transport", do_check_protocol }, + { "local_port", do_check_local_port }, + { "dsfield", do_check_dsfield }, +}; + +#define DLADM_MAX_FLOWATTRS (sizeof (attr_table) / sizeof (fattr_desc_t)) + +static dladm_status_t +do_check_local_ip(char *attr_val, flow_desc_t *fdesc) +{ + return (do_check_ip_addr(attr_val, B_TRUE, fdesc)); +} + +static dladm_status_t +do_check_remote_ip(char *attr_val, flow_desc_t *fdesc) +{ + return (do_check_ip_addr(attr_val, B_FALSE, fdesc)); +} + +dladm_status_t +do_check_ip_addr(char *addr_str, boolean_t local, flow_desc_t *fd) +{ + struct addrinfo *info = NULL; + dladm_status_t status; + int err, prefix_max, prefix_len = 0; + char *prefix_str, *endp = NULL; + flow_mask_t mask; + in6_addr_t *addr; + uchar_t *netmask; + + if ((prefix_str = strchr(addr_str, '/')) != NULL) { + *prefix_str++ = '\0'; + errno = 0; + prefix_len = (int)strtol(prefix_str, &endp, 10); + if (errno != 0 || prefix_len == 0 || *endp != '\0') + return (DLADM_STATUS_INVALID_PREFIXLEN); + } + + err = getaddrinfo(addr_str, NULL, NULL, &info); + if (err != 0) + return (DLADM_STATUS_INVALID_IP); + + mask = FLOW_IP_VERSION; + if (local) { + mask |= FLOW_IP_LOCAL; + addr = &fd->fd_local_addr; + netmask = (uchar_t *)&fd->fd_local_netmask; + } else { + mask |= FLOW_IP_REMOTE; + addr = &fd->fd_remote_addr; + netmask = (uchar_t *)&fd->fd_remote_netmask; + } + + if (info->ai_family == AF_INET) { + IN6_INADDR_TO_V4MAPPED(&(((struct sockaddr_in *) + (void *)info->ai_addr)->sin_addr), addr); + prefix_max = IP_ABITS; + fd->fd_ipversion = IPV4_VERSION; + netmask = (uchar_t *) + &(V4_PART_OF_V6((*((in6_addr_t *)(void *)netmask)))); + } else if (info->ai_family == AF_INET6) { + *addr = ((struct sockaddr_in6 *) + (void *)info->ai_addr)->sin6_addr; + prefix_max = IPV6_ABITS; + fd->fd_ipversion = IPV6_VERSION; + } else { + freeaddrinfo(info); + return (DLADM_STATUS_INVALID_IP); + } + + if (prefix_len == 0) + prefix_len = prefix_max; + + status = dladm_prefixlen2mask(prefix_len, prefix_max, netmask); + + if (status != DLADM_STATUS_OK) { + freeaddrinfo(info); + return (DLADM_STATUS_INVALID_PREFIXLEN); + } + + fd->fd_mask |= mask; + freeaddrinfo(info); + return (DLADM_STATUS_OK); +} + +dladm_status_t +do_check_protocol(char *attr_val, flow_desc_t *fdesc) +{ + uint8_t protocol; + + protocol = dladm_str2proto(attr_val); + + if (protocol != 0) { + fdesc->fd_mask |= FLOW_IP_PROTOCOL; + fdesc->fd_protocol = protocol; + return (DLADM_STATUS_OK); + } else { + return (DLADM_STATUS_INVALID_PROTOCOL); + } +} + +dladm_status_t +do_check_local_port(char *attr_val, flow_desc_t *fdesc) +{ + return (do_check_port(attr_val, B_TRUE, fdesc)); +} + +dladm_status_t +do_check_port(char 
*attr_val, boolean_t local, flow_desc_t *fdesc)
+{
+	char	*endp = NULL;
+	long	val;
+
+	if (local) {
+		fdesc->fd_mask |= FLOW_ULP_PORT_LOCAL;
+		val = strtol(attr_val, &endp, 10);
+		if (val < 1 || val > MAX_PORT)
+			return (DLADM_STATUS_INVALID_PORT);
+		fdesc->fd_local_port = htons((uint16_t)val);
+	} else {
+		return (DLADM_STATUS_BADVAL);
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Check for invalid and/or duplicate attribute specification
+ */
+static dladm_status_t
+flow_attrlist_check(dladm_arg_list_t *attrlist)
+{
+	int		i, j;
+	boolean_t	isset[DLADM_MAX_FLOWATTRS];
+	boolean_t	matched;
+
+	for (j = 0; j < DLADM_MAX_FLOWATTRS; j++)
+		isset[j] = B_FALSE;
+
+	for (i = 0; i < attrlist->al_count; i++) {
+		matched = B_FALSE;
+		for (j = 0; j < DLADM_MAX_FLOWATTRS; j++) {
+			if (strcmp(attrlist->al_info[i].ai_name,
+			    attr_table[j].ad_name) == 0) {
+				if (isset[j])
+					return (DLADM_STATUS_FLOW_INCOMPATIBLE);
+				else
+					isset[j] = B_TRUE;
+				matched = B_TRUE;
+			}
+		}
+		/*
+		 * If the attribute did not match any of the attributes in
+		 * attr_table, it is an invalid attribute.
+		 */
+		if (!matched)
+			return (DLADM_STATUS_BADARG);
+	}
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert an attribute list to a flow_desc_t using the attribute ad_check()
+ * functions.
+ */
+dladm_status_t
+dladm_flow_attrlist_extract(dladm_arg_list_t *attrlist, flow_desc_t *flowdesc)
+{
+	dladm_status_t	status = DLADM_STATUS_BADARG;
+	int		i;
+
+	for (i = 0; i < attrlist->al_count; i++) {
+		dladm_arg_info_t	*aip = &attrlist->al_info[i];
+		int			j;
+
+		for (j = 0; j < DLADM_MAX_FLOWATTRS; j++) {
+			fattr_desc_t	*adp = &attr_table[j];
+
+			if (strcasecmp(aip->ai_name, adp->ad_name) != 0)
+				continue;
+
+			if ((aip->ai_val == NULL) || (*aip->ai_val == NULL))
+				return (DLADM_STATUS_BADARG);
+
+			if (adp->ad_check != NULL)
+				status = adp->ad_check(*aip->ai_val, flowdesc);
+			else
+				status = DLADM_STATUS_BADARG;
+
+			if (status != DLADM_STATUS_OK)
+				return (status);
+		}
+	}
+	return (status);
+}
+
+void
+dladm_free_attrs(dladm_arg_list_t *list)
+{
+	dladm_free_args(list);
+}
+
+dladm_status_t
+dladm_parse_flow_attrs(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	if (dladm_parse_args(str, listp, novalues)
+	    != DLADM_STATUS_OK)
+		return (DLADM_STATUS_ATTR_PARSE_ERR);
+
+	if (flow_attrlist_check(*listp) != DLADM_STATUS_OK) {
+		dladm_free_attrs(*listp);
+		return (DLADM_STATUS_ATTR_PARSE_ERR);
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+do_check_dsfield(char *str, flow_desc_t *fd)
+{
+	char	*mask_str, *endp = NULL;
+	uint_t	mask = 0xff, value;
+
+	if ((mask_str = strchr(str, ':')) != NULL) {
+		*mask_str++ = '\0';
+		errno = 0;
+		mask = strtoul(mask_str, &endp, 16);
+		if (errno != 0 || mask == 0 || mask > 0xff ||
+		    *endp != '\0')
+			return (DLADM_STATUS_INVALID_DSFMASK);
+	}
+	errno = 0;
+	endp = NULL;
+	value = strtoul(str, &endp, 16);
+	if (errno != 0 || value == 0 || value > 0xff || *endp != '\0')
+		return (DLADM_STATUS_INVALID_DSF);
+
+	fd->fd_dsfield = (uint8_t)value;
+	fd->fd_dsfield_mask = (uint8_t)mask;
+	fd->fd_mask |= FLOW_IP_DSFIELD;
+	return (DLADM_STATUS_OK);
+}
+
+char *
+dladm_proto2str(uint8_t protocol)
+{
+	if (protocol == IPPROTO_TCP)
+		return ("tcp");
+	if (protocol == IPPROTO_UDP)
+		return ("udp");
+	if (protocol == IPPROTO_SCTP)
+		return ("sctp");
+	if (protocol == IPPROTO_ICMPV6)
+		return ("icmpv6");
+	if (protocol == IPPROTO_ICMP)
+		return ("icmp");
+	return ("");
+}
+
+uint8_t
+dladm_str2proto(const char *protostr)
+{
+	if (strncasecmp(protostr, "tcp", 3) == 0)
+		return (IPPROTO_TCP);
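+	/*
+	 * The names below are matched with strncasecmp(), so the longer
+	 * "icmpv6" prefix must be tested before "icmp".
+	 */
+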
else if (strncasecmp(protostr, "udp", 3) == 0) + return (IPPROTO_UDP); + else if (strncasecmp(protostr, "sctp", 4) == 0) + return (IPPROTO_SCTP); + else if (strncasecmp(protostr, "icmpv6", 6) == 0) + return (IPPROTO_ICMPV6); + else if (strncasecmp(protostr, "icmp", 4) == 0) + return (IPPROTO_ICMP); + + return (0); +} + +void +dladm_flow_attr_ip2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + struct in_addr ipaddr; + int prefix_len, prefix_max; + char *cp, abuf[INET6_ADDRSTRLEN]; + + if (fdesc.fd_mask & FLOW_IP_LOCAL) { + if (fdesc.fd_ipversion == IPV6_VERSION) { + (void) inet_ntop(AF_INET6, &fdesc.fd_local_addr, abuf, + INET6_ADDRSTRLEN); + cp = abuf; + prefix_max = IPV6_ABITS; + } else { + ipaddr.s_addr = fdesc.fd_local_addr._S6_un._S6_u32[3]; + cp = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } + (void) dladm_mask2prefixlen(&fdesc.fd_local_netmask, + prefix_max, &prefix_len); + (void) snprintf(buf, buf_len, "LCL:%s/%d ", cp, prefix_len); + } else if (fdesc.fd_mask & FLOW_IP_REMOTE) { + if (fdesc.fd_ipversion == IPV6_VERSION) { + (void) inet_ntop(AF_INET6, &fdesc.fd_remote_addr, abuf, + INET6_ADDRSTRLEN); + cp = abuf; + prefix_max = IPV6_ABITS; + } else { + ipaddr.s_addr = fdesc.fd_remote_addr._S6_un._S6_u32[3]; + cp = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } + (void) dladm_mask2prefixlen(&fdesc.fd_remote_netmask, + prefix_max, &prefix_len); + (void) snprintf(buf, buf_len, "RMT:%s/%d ", cp, prefix_len); + } else { + buf[0] = '\0'; + } +} + +void +dladm_flow_attr_proto2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + + (void) snprintf(buf, buf_len, "%s", + dladm_proto2str(fdesc.fd_protocol)); +} + +void +dladm_flow_attr_port2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + + if (fdesc.fd_mask & FLOW_ULP_PORT_LOCAL) { + (void) snprintf(buf, buf_len, "%d", + ntohs(fdesc.fd_local_port)); + } else { + buf[0] = '\0'; + } +} + +void +dladm_flow_attr_dsfield2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + + if (fdesc.fd_mask & FLOW_IP_DSFIELD) { + (void) snprintf(buf, buf_len, "0x%x:0x%x", + fdesc.fd_dsfield, fdesc.fd_dsfield_mask); + } else { + buf[0] = '\0'; + } +} diff --git a/usr/src/lib/libdladm/common/flowprop.c b/usr/src/lib/libdladm/common/flowprop.c new file mode 100644 index 0000000000..a2125a9d33 --- /dev/null +++ b/usr/src/lib/libdladm/common/flowprop.c @@ -0,0 +1,611 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <stdlib.h> +#include <strings.h> +#include <errno.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/dld.h> +#include <fcntl.h> +#include <unistd.h> +#include <libdevinfo.h> +#include <libdladm_impl.h> +#include <libdlflow.h> +#include <libdlflow_impl.h> +#include <libintl.h> + +#include <dlfcn.h> +#include <link.h> + +/* + * XXX duplicate define + */ +#define DLADM_PROP_VAL_MAX 32 + +static dladm_status_t i_dladm_set_flowprop_db(const char *, const char *, + char **, uint_t); +static dladm_status_t i_dladm_get_flowprop_db(const char *, const char *, + char **, uint_t *); + +static fpd_getf_t do_get_maxbw; +static fpd_setf_t do_set_maxbw; +static fpd_checkf_t do_check_maxbw; + +static fpd_getf_t do_get_priority; +static fpd_setf_t do_set_priority; +static fpd_checkf_t do_check_priority; + +static fprop_desc_t prop_table[] = { + { "maxbw", { "", NULL }, NULL, 0, B_FALSE, + do_set_maxbw, NULL, + do_get_maxbw, do_check_maxbw}, + { "priority", { "", NULL }, NULL, 0, B_FALSE, + do_set_priority, NULL, + do_get_priority, do_check_priority} +}; + +#define DLADM_MAX_FLOWPROPS (sizeof (prop_table) / sizeof (fprop_desc_t)) + +static prop_table_t prop_tbl = { + prop_table, + DLADM_MAX_FLOWPROPS +}; + +static resource_prop_t rsrc_prop_table[] = { + {"maxbw", do_extract_maxbw}, + {"priority", do_extract_priority} +}; +#define DLADM_MAX_RSRC_PROP (sizeof (rsrc_prop_table) / \ + sizeof (resource_prop_t)) + +static dladm_status_t flow_proplist_check(dladm_arg_list_t *); + +dladm_status_t +dladm_set_flowprop(const char *flow, const char *prop_name, char **prop_val, + uint_t val_cnt, uint_t flags, char **errprop) +{ + dladm_status_t status = DLADM_STATUS_BADARG; + + if (flow == NULL || (prop_val == NULL && val_cnt > 0) || + (prop_val != NULL && val_cnt == 0) || flags == 0) + return (DLADM_STATUS_BADARG); + + if ((flags & DLADM_OPT_ACTIVE) != 0) { + status = i_dladm_set_prop_temp(flow, prop_name, prop_val, + val_cnt, flags, errprop, &prop_tbl); + if (status == DLADM_STATUS_TEMPONLY && + (flags & DLADM_OPT_PERSIST) != 0) + return (DLADM_STATUS_TEMPONLY); + if (status != DLADM_STATUS_OK) + return (status); + } + if ((flags & DLADM_OPT_PERSIST) != 0) { + if (i_dladm_is_prop_temponly(prop_name, errprop, &prop_tbl)) + return (DLADM_STATUS_TEMPONLY); + + status = i_dladm_set_flowprop_db(flow, prop_name, + prop_val, val_cnt); + } + return (status); +} + +dladm_status_t +dladm_walk_flowprop(int (*func)(void *, const char *), const char *flow, + void *arg) +{ + int i; + + if (flow == NULL || func == NULL) + return (DLADM_STATUS_BADARG); + + /* Then show data-flow properties if there are any */ + for (i = 0; i < DLADM_MAX_FLOWPROPS; i++) { + if (func(arg, prop_table[i].pd_name) != DLADM_WALK_CONTINUE) + break; + } + return (DLADM_STATUS_OK); +} + +dladm_status_t +dladm_get_flowprop(const char *flow, uint32_t type, + const char *prop_name, char **prop_val, uint_t *val_cntp) +{ + dladm_status_t status; + + if (flow == NULL || prop_name == NULL || prop_val == NULL || + val_cntp == NULL || *val_cntp == 0) + return (DLADM_STATUS_BADARG); + + if (type == DLADM_PROP_VAL_PERSISTENT) { + if (i_dladm_is_prop_temponly(prop_name, NULL, &prop_tbl)) + return (DLADM_STATUS_TEMPONLY); + return (i_dladm_get_flowprop_db(flow, prop_name, + prop_val, val_cntp)); + } + + status = i_dladm_get_prop_temp(flow, type, prop_name, + prop_val, val_cntp, &prop_tbl); + if (status != DLADM_STATUS_NOTFOUND) + return (status); + + return (DLADM_STATUS_BADARG); +} + +#define FLOWPROP_RW_DB(statep, 
writeop) \ + (i_dladm_rw_db("/etc/dladm/flowprop.conf", \ + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH, process_prop_db, \ + (statep), (writeop))) + +static dladm_status_t +i_dladm_set_flowprop_db(const char *flow, const char *prop_name, + char **prop_val, uint_t val_cnt) +{ + prop_db_state_t state; + + state.ls_op = process_prop_set; + state.ls_name = flow; + state.ls_propname = prop_name; + state.ls_propval = prop_val; + state.ls_valcntp = &val_cnt; + state.ls_initop = NULL; + + return (FLOWPROP_RW_DB(&state, B_TRUE)); +} + +static dladm_status_t +i_dladm_get_flowprop_db(const char *flow, const char *prop_name, + char **prop_val, uint_t *val_cntp) +{ + prop_db_state_t state; + + state.ls_op = process_prop_get; + state.ls_name = flow; + state.ls_propname = prop_name; + state.ls_propval = prop_val; + state.ls_valcntp = val_cntp; + state.ls_initop = NULL; + + return (FLOWPROP_RW_DB(&state, B_FALSE)); +} + +dladm_status_t +i_dladm_init_flowprop_db(void) +{ + prop_db_state_t state; + + state.ls_op = process_prop_init; + state.ls_name = NULL; + state.ls_propname = NULL; + state.ls_propval = NULL; + state.ls_valcntp = NULL; + state.ls_initop = dladm_set_flowprop; + + return (FLOWPROP_RW_DB(&state, B_FALSE)); +} + +#define MIN_INFO_SIZE (4 * 1024) + +dladm_status_t +dladm_flow_info(const char *flow, dladm_flow_attr_t *attr) +{ + dld_ioc_walkflow_t *ioc; + int bufsize, fd; + dld_flowinfo_t *flowinfo; + + if ((flow == NULL) || (attr == NULL)) + return (DLADM_STATUS_BADARG); + + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) + return (dladm_errno2status(errno)); + + bufsize = MIN_INFO_SIZE; + if ((ioc = calloc(1, bufsize)) == NULL) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + + (void) strlcpy(ioc->wf_name, flow, sizeof (ioc->wf_name)); + ioc->wf_len = bufsize - sizeof (*ioc); + + while (ioctl(fd, DLDIOC_WALKFLOW, ioc) < 0) { + if (errno == ENOSPC) { + bufsize *= 2; + ioc = realloc(ioc, bufsize); + if (ioc != NULL) { + (void) strlcpy(ioc->wf_name, flow, + MAXNAMELEN); + ioc->wf_len = bufsize - sizeof (*ioc); + continue; + } + } + free(ioc); + (void) close(fd); + return (dladm_errno2status(errno)); + } + + bzero(attr, sizeof (*attr)); + + flowinfo = (dld_flowinfo_t *)(void *)(ioc + 1); + + attr->fa_linkid = flowinfo->fi_linkid; + bcopy(&flowinfo->fi_flowname, &attr->fa_flowname, + sizeof (attr->fa_flowname)); + bcopy(&flowinfo->fi_flow_desc, &attr->fa_flow_desc, + sizeof (attr->fa_flow_desc)); + bcopy(&flowinfo->fi_resource_props, &attr->fa_resource_props, + sizeof (attr->fa_resource_props)); + + free(ioc); + (void) close(fd); + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_get_maxbw(const char *flow, char **prop_val, uint_t *val_cnt) +{ + mac_resource_props_t *mrp; + char buf[DLADM_STRSIZE]; + dladm_flow_attr_t fa; + dladm_status_t status; + + status = dladm_flow_info(flow, &fa); + if (status != DLADM_STATUS_OK) + return (status); + mrp = &(fa.fa_resource_props); + + *val_cnt = 1; + if (mrp->mrp_mask & MRP_MAXBW) { + (void) snprintf(prop_val[0], DLADM_STRSIZE, "%s", + dladm_bw2str(mrp->mrp_maxbw, buf)); + } else { + return (DLADM_STATUS_NOTSUP); + } + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_set_maxbw(const char *flow, val_desc_t *vdp, uint_t val_cnt) +{ + dld_ioc_modifyflow_t attr; + int fd; + mac_resource_props_t mrp; + void *val; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + bzero(&mrp, sizeof (mrp)); + if (vdp != NULL && (val = (void *)vdp->vd_val) != NULL) { + bcopy(val, &mrp.mrp_maxbw, sizeof (int64_t)); 
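+		/*
+		 * vd_val was allocated by the corresponding check routine
+		 * (do_check_maxbw()); it has been copied into mrp and can
+		 * be released now.
+		 */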
+ free(val); + } else { + mrp.mrp_maxbw = MRP_MAXBW_RESETVAL; + } + mrp.mrp_mask = MRP_MAXBW; + + bzero(&attr, sizeof (attr)); + (void) strlcpy(attr.mf_name, flow, sizeof (attr.mf_name)); + bcopy(&mrp, &attr.mf_resource_props, sizeof (mac_resource_props_t)); + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) { + return (dladm_errno2status(errno)); + } + + if (ioctl(fd, DLDIOC_MODIFYFLOW, &attr) < 0) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + (void) close(fd); + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_maxbw(fprop_desc_t *pdp, char **prop_val, uint_t val_cnt, + val_desc_t **vdpp) +{ + uint64_t *maxbw; + val_desc_t *vdp = NULL; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + maxbw = malloc(sizeof (uint64_t)); + if (maxbw == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2bw(*prop_val, maxbw); + if (status != DLADM_STATUS_OK) { + free(maxbw); + return (status); + } + + if ((*maxbw < MRP_MAXBW_MINVAL) && (*maxbw != 0)) { + free(maxbw); + return (DLADM_STATUS_MINMAXBW); + } + + vdp = malloc(sizeof (val_desc_t)); + if (vdp == NULL) { + free(maxbw); + return (DLADM_STATUS_NOMEM); + } + + vdp->vd_val = (uintptr_t)maxbw; + *vdpp = vdp; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_get_priority(const char *flow, char **prop_val, uint_t *val_cnt) +{ + mac_resource_props_t *mrp; + char buf[DLADM_STRSIZE]; + dladm_flow_attr_t fa; + dladm_status_t status; + + bzero(&fa, sizeof (dladm_flow_attr_t)); + status = dladm_flow_info(flow, &fa); + if (status != DLADM_STATUS_OK) + return (status); + mrp = &(fa.fa_resource_props); + + *val_cnt = 1; + if (mrp->mrp_mask & MRP_PRIORITY) { + (void) snprintf(prop_val[0], DLADM_STRSIZE, "%s", + dladm_pri2str(mrp->mrp_priority, buf)); + } else { + return (DLADM_STATUS_NOTSUP); + } + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_set_priority(const char *flow, val_desc_t *vdp, uint_t val_cnt) +{ + dld_ioc_modifyflow_t attr; + int fd; + mac_resource_props_t mrp; + void *val; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + bzero(&mrp, sizeof (mrp)); + if (vdp != NULL && (val = (void *)vdp->vd_val) != NULL) { + bcopy(val, &mrp.mrp_priority, sizeof (mac_priority_level_t)); + free(val); + } else { + mrp.mrp_priority = MPL_RESET; + } + mrp.mrp_mask = MRP_PRIORITY; + + bzero(&attr, sizeof (attr)); + (void) strlcpy(attr.mf_name, flow, sizeof (attr.mf_name)); + bcopy(&mrp, &attr.mf_resource_props, sizeof (mac_resource_props_t)); + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) { + return (dladm_errno2status(errno)); + } + + if (ioctl(fd, DLDIOC_MODIFYFLOW, &attr) < 0) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + (void) close(fd); + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_priority(fprop_desc_t *pdp, char **prop_val, uint_t val_cnt, + val_desc_t **vdpp) +{ + mac_priority_level_t *pri; + val_desc_t *vdp = NULL; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + pri = malloc(sizeof (mac_priority_level_t)); + if (pri == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2pri(*prop_val, pri); + if (status != DLADM_STATUS_OK) { + free(pri); + return (status); + } + + if (*pri == -1) { + free(pri); + return (DLADM_STATUS_BADVAL); + } + + vdp = malloc(sizeof (val_desc_t)); + if (vdp == NULL) { + free(pri); + return (DLADM_STATUS_NOMEM); + } + + vdp->vd_val = 
(uintptr_t)pri;
+	*vdpp = vdp;
+	return (DLADM_STATUS_OK);
+}
+
+static dladm_status_t
+flow_proplist_check(dladm_arg_list_t *proplist)
+{
+	int		i, j;
+	boolean_t	matched;
+
+	for (i = 0; i < proplist->al_count; i++) {
+		matched = B_FALSE;
+		for (j = 0; j < DLADM_MAX_FLOWPROPS; j++) {
+			if (strcmp(proplist->al_info[i].ai_name,
+			    prop_table[j].pd_name) == 0)
+				matched = B_TRUE;
+		}
+		if (!matched)
+			return (DLADM_STATUS_BADPROP);
+	}
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_parse_flow_props(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	dladm_status_t	status;
+
+	status = dladm_parse_args(str, listp, novalues);
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	status = flow_proplist_check(*listp);
+	if (status != DLADM_STATUS_OK) {
+		dladm_free_props(*listp);
+		return (status);
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Retrieve the named property from a proplist, check the value and
+ * convert to a kernel structure.
+ */
+static dladm_status_t
+i_dladm_flow_proplist_extract_one(dladm_arg_list_t *proplist,
+    const char *name, void *val)
+{
+	dladm_status_t		status = DLADM_STATUS_OK;
+	dladm_arg_info_t	*aip = NULL;
+	int			i, j;
+
+	/* Find named property in proplist */
+	for (i = 0; i < proplist->al_count; i++) {
+		aip = &proplist->al_info[i];
+		if (strcasecmp(aip->ai_name, name) == 0)
+			break;
+	}
+
+	/* Property not in list */
+	if (i == proplist->al_count)
+		return (DLADM_STATUS_OK);
+
+	for (i = 0; i < DLADM_MAX_FLOWPROPS; i++) {
+		fprop_desc_t	*pdp = &prop_table[i];
+		val_desc_t	*vdp = NULL;
+
+		if (strcasecmp(aip->ai_name, pdp->pd_name) != 0)
+			continue;
+
+		if (aip->ai_val == NULL)
+			return (DLADM_STATUS_BADARG);
+
+		/* Check the property value; pd_check() allocates vdp */
+		if (pdp->pd_check != NULL) {
+			status = pdp->pd_check(pdp, aip->ai_val,
+			    aip->ai_count, &vdp);
+		} else {
+			status = DLADM_STATUS_BADARG;
+		}
+
+		if (status != DLADM_STATUS_OK)
+			return (status);
+
+		for (j = 0; j < DLADM_MAX_RSRC_PROP; j++) {
+			resource_prop_t	*rpp = &rsrc_prop_table[j];
+
+			if (strcasecmp(aip->ai_name, rpp->rp_name) != 0)
+				continue;
+
+			/* Extract kernel structure */
+			if (rpp->rp_extract != NULL) {
+				status = rpp->rp_extract(vdp, val,
+				    aip->ai_count);
+			} else {
+				status = DLADM_STATUS_BADARG;
+			}
+			break;
+		}
+
+		/* release the checked value allocated by pd_check() */
+		if (vdp != NULL)
+			free((void *)vdp->vd_val);
+		free(vdp);
+
+		if (status != DLADM_STATUS_OK)
+			return (status);
+
+		break;
+	}
+	return (status);
+}
+
+/*
+ * Extract properties from a proplist and convert to mac_resource_props_t.
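+ * Currently only the "maxbw" and "priority" flow properties are
+ * extracted (see rsrc_prop_table above).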
+ */ +dladm_status_t +dladm_flow_proplist_extract(dladm_arg_list_t *proplist, + mac_resource_props_t *mrp) +{ + dladm_status_t status = DLADM_STATUS_OK; + + status = i_dladm_flow_proplist_extract_one(proplist, "maxbw", mrp); + if (status != DLADM_STATUS_OK) + return (status); + status = i_dladm_flow_proplist_extract_one(proplist, "priority", mrp); + if (status != DLADM_STATUS_OK) + return (status); + return (status); +} + +dladm_status_t +i_dladm_set_flow_proplist_db(char *flow, dladm_arg_list_t *proplist) +{ + dladm_status_t status, ssave = DLADM_STATUS_OK; + dladm_arg_info_t ai; + int i; + + for (i = 0; i < proplist->al_count; i++) { + ai = proplist->al_info[i]; + status = i_dladm_set_flowprop_db(flow, ai.ai_name, + ai.ai_val, ai.ai_count); + if (status != DLADM_STATUS_OK) + ssave = status; + } + return (ssave); +} diff --git a/usr/src/lib/libdladm/common/libdladm.c b/usr/src/lib/libdladm/common/libdladm.c index fa588df066..cc6bf542f7 100644 --- a/usr/src/lib/libdladm/common/libdladm.c +++ b/usr/src/lib/libdladm/common/libdladm.c @@ -29,6 +29,7 @@ #include <fcntl.h> #include <strings.h> #include <dirent.h> +#include <stdlib.h> #include <sys/param.h> #include <sys/stat.h> #include <libdladm_impl.h> @@ -89,7 +90,7 @@ dladm_status2str(dladm_status_t status, char *buf) s = "I/O error"; break; case DLADM_STATUS_TEMPONLY: - s = "change cannot be persistent, specify -t please"; + s = "change cannot be persistent"; break; case DLADM_STATUS_TIMEDOUT: s = "operation timed out"; @@ -127,6 +128,117 @@ dladm_status2str(dladm_status_t status, char *buf) case DLADM_STATUS_NONOTIF: s = "link notification is not supported"; break; + case DLADM_STATUS_BADTIMEVAL: + s = "invalid time range"; + break; + case DLADM_STATUS_INVALIDMACADDR: + s = "invalid MAC address value"; + break; + case DLADM_STATUS_INVALIDMACADDRNIC: + s = "MAC address reserved for use by underlying data-link"; + break; + case DLADM_STATUS_INVALIDMACADDRINUSE: + s = "MAC address is already in use"; + break; + case DLADM_STATUS_MACFACTORYSLOTINVALID: + s = "invalid factory MAC address slot"; + break; + case DLADM_STATUS_MACFACTORYSLOTUSED: + s = "factory MAC address slot already used"; + break; + case DLADM_STATUS_MACFACTORYSLOTALLUSED: + s = "all factory MAC address slots are in use"; + break; + case DLADM_STATUS_MACFACTORYNOTSUP: + s = "factory MAC address slots not supported"; + break; + case DLADM_STATUS_INVALIDMACPREFIX: + s = "Invalid MAC address prefix value"; + break; + case DLADM_STATUS_INVALIDMACPREFIXLEN: + s = "Invalid MAC address prefix length"; + break; + case DLADM_STATUS_CPUMAX: + s = "non-existent processor ID"; + break; + case DLADM_STATUS_CPUERR: + s = "could not determine processor status"; + break; + case DLADM_STATUS_CPUNOTONLINE: + s = "processor not online"; + break; + case DLADM_STATUS_DB_NOTFOUND: + s = "database not found"; + break; + case DLADM_STATUS_DB_PARSE_ERR: + s = "database parse error"; + break; + case DLADM_STATUS_PROP_PARSE_ERR: + s = "property parse error"; + break; + case DLADM_STATUS_ATTR_PARSE_ERR: + s = "attribute parse error"; + break; + case DLADM_STATUS_FLOW_DB_ERR: + s = "flow database error"; + break; + case DLADM_STATUS_FLOW_DB_OPEN_ERR: + s = "flow database open error"; + break; + case DLADM_STATUS_FLOW_DB_PARSE_ERR: + s = "flow database parse error"; + break; + case DLADM_STATUS_FLOWPROP_DB_PARSE_ERR: + s = "flow property database parse error"; + break; + case DLADM_STATUS_FLOW_ADD_ERR: + s = "flow add error"; + break; + case DLADM_STATUS_FLOW_WALK_ERR: + s = "flow walk error"; + break; + 
case DLADM_STATUS_FLOW_IDENTICAL:
+		s = "a flow with identical attributes exists";
+		break;
+	case DLADM_STATUS_FLOW_INCOMPATIBLE:
+		s = "flow(s) with incompatible attributes exists";
+		break;
+	case DLADM_STATUS_FLOW_EXISTS:
+		s = "link still has flows";
+		break;
+	case DLADM_STATUS_PERSIST_FLOW_EXISTS:
+		s = "persistent flow with the same name exists";
+		break;
+	case DLADM_STATUS_INVALID_IP:
+		s = "invalid IP address";
+		break;
+	case DLADM_STATUS_INVALID_PREFIXLEN:
+		s = "invalid IP prefix length";
+		break;
+	case DLADM_STATUS_INVALID_PROTOCOL:
+		s = "invalid IP protocol";
+		break;
+	case DLADM_STATUS_INVALID_PORT:
+		s = "invalid port number";
+		break;
+	case DLADM_STATUS_INVALID_DSF:
+		s = "invalid dsfield";
+		break;
+	case DLADM_STATUS_INVALID_DSFMASK:
+		s = "invalid dsfield mask";
+		break;
+	case DLADM_STATUS_INVALID_MACMARGIN:
+		s = "MTU check failed, use lower MTU or -f option";
+		break;
+	case DLADM_STATUS_BADPROP:
+		s = "invalid property";
+		break;
+	case DLADM_STATUS_MINMAXBW:
+		s = "minimum value for maxbw is 1.2M";
+		break;
+	case DLADM_STATUS_NO_HWRINGS:
+		s = "request for hardware rings failed";
+		break;
	default:
		s = "<unknown error>";
		break;
@@ -169,11 +281,100 @@ dladm_errno2status(int err)
		return (DLADM_STATUS_LINKBUSY);
	case EAGAIN:
		return (DLADM_STATUS_TRYAGAIN);
+	case ENOTEMPTY:
+		return (DLADM_STATUS_FLOW_EXISTS);
+	case EOPNOTSUPP:
+		return (DLADM_STATUS_FLOW_INCOMPATIBLE);
+	case EALREADY:
+		return (DLADM_STATUS_FLOW_IDENTICAL);
	default:
		return (DLADM_STATUS_FAILED);
	}
}

+dladm_status_t
+dladm_str2bw(char *oarg, uint64_t *bw)
+{
+	char		*endp = NULL;
+	int64_t		n;
+	int		mult = 1;
+
+	errno = 0;
+	n = strtoull(oarg, &endp, 10);
+
+	if ((errno != 0) || (strlen(endp) > 1))
+		return (DLADM_STATUS_BADARG);
+
+	if (n < 0)
+		return (DLADM_STATUS_BADVAL);
+
+	switch (*endp) {
+	case 'k':
+	case 'K':
+		mult = 1000;
+		break;
+	case 'm':
+	case 'M':
+	case '\0':
+		mult = 1000000;
+		break;
+	case 'g':
+	case 'G':
+		mult = 1000000000;
+		break;
+	case '%':
+		/*
+		 * percentages not supported for now,
+		 * see RFE 6540675
+		 */
+		return (DLADM_STATUS_NOTSUP);
+	default:
+		return (DLADM_STATUS_BADVAL);
+	}
+
+	*bw = n * mult;
+
+	/* check for overflow */
+	if (*bw / mult != n)
+		return (DLADM_STATUS_BADARG);
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert bandwidth in bps to a string in Mbps.  For values of
+ * 1 Mbps (1000000 bps) and above with no fractional part, print a
+ * whole Mbps value.  For values with fractional Mbps (whole Kbps),
+ * print the bandwidth in a manner similar to a floating point format.
+ *
+ *        bps       string
+ *          0            0
+ *        100            0
+ *       2000        0.002
+ *     431000        0.431
+ *    1000000            1
+ *    1030000        1.030
+ *  100000000          100
+ */
+const char *
+dladm_bw2str(int64_t bw, char *buf)
+{
+	int kbps, mbps;
+
+	kbps = (bw%1000000)/1000;
+	mbps = bw/1000000;
+	if (kbps != 0) {
+		if (mbps == 0)
+			(void) snprintf(buf, DLADM_STRSIZE, "0.%03u", kbps);
+		else
+			(void) snprintf(buf, DLADM_STRSIZE, "%5u.%03u", mbps,
+			    kbps);
+	} else {
+		(void) snprintf(buf, DLADM_STRSIZE, "%5u", mbps);
+	}
+
+	return (buf);
+}
+
#define	LOCK_DB_PERMS	S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH

static int
@@ -241,6 +442,9 @@ dladm_class2str(datalink_class_t class, char *buf)
	case DATALINK_CLASS_VNIC:
		s = "vnic";
		break;
+	case DATALINK_CLASS_ETHERSTUB:
+		s = "etherstub";
+		break;
	default:
		s = "unknown";
		break;
@@ -491,3 +695,123 @@ dladm_valid_linkname(const char *link)

	return (B_TRUE);
}
+
+/*
+ * Convert priority string to a value.
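+ * Accepts exactly "low", "medium" or "high" (case-insensitively); any
+ * other token is rejected with DLADM_STATUS_BADVAL.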
+ */
+dladm_status_t
+dladm_str2pri(char *token, mac_priority_level_t *pri)
+{
+	if (strlen(token) == strlen("low") &&
+	    strncasecmp(token, "low", strlen("low")) == 0) {
+		*pri = MPL_LOW;
+	} else if (strlen(token) == strlen("medium") &&
+	    strncasecmp(token, "medium", strlen("medium")) == 0) {
+		*pri = MPL_MEDIUM;
+	} else if (strlen(token) == strlen("high") &&
+	    strncasecmp(token, "high", strlen("high")) == 0) {
+		*pri = MPL_HIGH;
+	} else {
+		return (DLADM_STATUS_BADVAL);
+	}
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert priority value to a string.
+ */
+const char *
+dladm_pri2str(mac_priority_level_t pri, char *buf)
+{
+	const char	*s;
+
+	switch (pri) {
+	case MPL_LOW:
+		s = "low";
+		break;
+	case MPL_MEDIUM:
+		s = "medium";
+		break;
+	case MPL_HIGH:
+		s = "high";
+		break;
+	default:
+		s = "--";
+		break;
+	}
+	(void) snprintf(buf, DLADM_STRSIZE, "%s", dgettext(TEXT_DOMAIN, s));
+	return (buf);
+}
+
+void
+dladm_free_args(dladm_arg_list_t *list)
+{
+	if (list != NULL) {
+		free(list->al_buf);
+		free(list);
+	}
+}
+
+dladm_status_t
+dladm_parse_args(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	dladm_arg_list_t	*list;
+	dladm_arg_info_t	*aip;
+	char			*buf, *curr;
+	int			len, i;
+
+	list = malloc(sizeof (dladm_arg_list_t));
+	if (list == NULL)
+		return (dladm_errno2status(errno));
+
+	list->al_count = 0;
+	list->al_buf = buf = strdup(str);
+	if (buf == NULL) {
+		free(list);
+		return (dladm_errno2status(errno));
+	}
+
+	curr = buf;
+	len = strlen(buf);
+	aip = NULL;
+	for (i = 0; i < len; i++) {
+		char		c = buf[i];
+		boolean_t	match = (c == '=' || c == ',');
+
+		if (!match && i != len - 1)
+			continue;
+
+		if (match) {
+			buf[i] = '\0';
+			if (*curr == '\0')
+				goto fail;
+		}
+
+		if (aip != NULL && c != '=') {
+			if (aip->ai_count >= DLADM_MAX_ARG_VALS)
+				goto fail;
+
+			if (novalues)
+				goto fail;
+
+			aip->ai_val[aip->ai_count] = curr;
+			aip->ai_count++;
+		} else {
+			if (list->al_count >= DLADM_MAX_ARG_CNT)
+				goto fail;
+
+			aip = &list->al_info[list->al_count];
+			aip->ai_name = curr;
+			aip->ai_count = 0;
+			list->al_count++;
+			if (c == ',')
+				aip = NULL;
+		}
+		curr = buf + i + 1;
+	}
+
+	*listp = list;
+	return (DLADM_STATUS_OK);
+
+fail:
+	dladm_free_args(list);
+	return (DLADM_STATUS_FAILED);
+}
diff --git a/usr/src/lib/libdladm/common/libdladm.h b/usr/src/lib/libdladm/common/libdladm.h
index df69a54615..a76245d478 100644
--- a/usr/src/lib/libdladm/common/libdladm.h
+++ b/usr/src/lib/libdladm/common/libdladm.h
@@ -26,7 +26,7 @@
#ifndef _LIBDLADM_H
#define	_LIBDLADM_H

-#include <sys/dls.h>
+#include <sys/dls_mgmt.h>
#include <sys/dlpi.h>

/*
@@ -60,16 +60,28 @@ extern "C" {
 *
 * - DLADM_OPT_PREFIX:
 *    The function requests to generate a link name using the specified prefix.
+ *
+ * - DLADM_OPT_VLAN:
+ *    Signifies the VLAN creation code path.
+ *
+ * - DLADM_OPT_HWRINGS:
+ *    Requires a hardware group of rings when creating a vnic.
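+ *
+ * - DLADM_OPT_ANCHOR:
+ *    Presumably marks creation over an anchor link (e.g. an etherstub)
+ *    rather than a physical link; not otherwise documented here.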
*/ #define DLADM_OPT_ACTIVE 0x00000001 #define DLADM_OPT_PERSIST 0x00000002 #define DLADM_OPT_CREATE 0x00000004 #define DLADM_OPT_FORCE 0x00000008 #define DLADM_OPT_PREFIX 0x00000010 +#define DLADM_OPT_ANCHOR 0x00000020 +#define DLADM_OPT_VLAN 0x00000040 +#define DLADM_OPT_HWRINGS 0x00000080 #define DLADM_WALK_TERMINATE 0 #define DLADM_WALK_CONTINUE -1 +#define DLADM_MAX_ARG_CNT 32 +#define DLADM_MAX_ARG_VALS 32 + typedef enum { DLADM_STATUS_OK = 0, DLADM_STATUS_BADARG, @@ -99,7 +111,44 @@ typedef enum { DLADM_STATUS_VIDINVAL, DLADM_STATUS_NONOTIF, DLADM_STATUS_TRYAGAIN, - DLADM_STATUS_NOTDEFINED + DLADM_STATUS_BADTIMEVAL, + DLADM_STATUS_INVALIDMACADDR, + DLADM_STATUS_INVALIDMACADDRNIC, + DLADM_STATUS_INVALIDMACADDRINUSE, + DLADM_STATUS_MACFACTORYSLOTINVALID, + DLADM_STATUS_MACFACTORYSLOTUSED, + DLADM_STATUS_MACFACTORYSLOTALLUSED, + DLADM_STATUS_MACFACTORYNOTSUP, + DLADM_STATUS_INVALIDMACPREFIX, + DLADM_STATUS_INVALIDMACPREFIXLEN, + DLADM_STATUS_CPUMAX, + DLADM_STATUS_CPUERR, + DLADM_STATUS_CPUNOTONLINE, + DLADM_STATUS_DB_NOTFOUND, + DLADM_STATUS_DB_PARSE_ERR, + DLADM_STATUS_PROP_PARSE_ERR, + DLADM_STATUS_ATTR_PARSE_ERR, + DLADM_STATUS_FLOW_DB_ERR, + DLADM_STATUS_FLOW_DB_OPEN_ERR, + DLADM_STATUS_FLOW_DB_PARSE_ERR, + DLADM_STATUS_FLOWPROP_DB_PARSE_ERR, + DLADM_STATUS_FLOW_ADD_ERR, + DLADM_STATUS_FLOW_WALK_ERR, + DLADM_STATUS_FLOW_IDENTICAL, + DLADM_STATUS_FLOW_INCOMPATIBLE, + DLADM_STATUS_FLOW_EXISTS, + DLADM_STATUS_PERSIST_FLOW_EXISTS, + DLADM_STATUS_INVALID_IP, + DLADM_STATUS_INVALID_PREFIXLEN, + DLADM_STATUS_INVALID_PROTOCOL, + DLADM_STATUS_INVALID_PORT, + DLADM_STATUS_INVALID_DSF, + DLADM_STATUS_INVALID_DSFMASK, + DLADM_STATUS_INVALID_MACMARGIN, + DLADM_STATUS_NOTDEFINED, + DLADM_STATUS_BADPROP, + DLADM_STATUS_MINMAXBW, + DLADM_STATUS_NO_HWRINGS } dladm_status_t; typedef enum { @@ -111,11 +160,63 @@ typedef enum { typedef int dladm_conf_t; #define DLADM_INVALID_CONF 0 +typedef struct dladm_arg_info { + const char *ai_name; + char *ai_val[DLADM_MAX_ARG_VALS]; + uint_t ai_count; +} dladm_arg_info_t; + +typedef struct dladm_arg_list { + dladm_arg_info_t al_info[DLADM_MAX_ARG_CNT]; + uint_t al_count; + char *al_buf; +} dladm_arg_list_t; + +typedef enum { + DLADM_LOGTYPE_LINK = 1, + DLADM_LOGTYPE_FLOW +} dladm_logtype_t; + +typedef struct dladm_usage { + char du_name[MAXLINKNAMELEN]; + uint64_t du_duration; + uint64_t du_stime; + uint64_t du_etime; + uint64_t du_ipackets; + uint64_t du_rbytes; + uint64_t du_opackets; + uint64_t du_obytes; + uint64_t du_bandwidth; + boolean_t du_last; +} dladm_usage_t; + extern const char *dladm_status2str(dladm_status_t, char *); extern dladm_status_t dladm_set_rootdir(const char *); extern const char *dladm_class2str(datalink_class_t, char *); extern const char *dladm_media2str(uint32_t, char *); extern boolean_t dladm_valid_linkname(const char *); +extern dladm_status_t dladm_str2bw(char *, uint64_t *); +extern const char *dladm_bw2str(int64_t, char *); + +extern dladm_status_t dladm_parse_flow_props(char *, dladm_arg_list_t **, + boolean_t); +extern dladm_status_t dladm_parse_link_props(char *, dladm_arg_list_t **, + boolean_t); +extern void dladm_free_props(dladm_arg_list_t *); +extern dladm_status_t dladm_parse_flow_attrs(char *, dladm_arg_list_t **, + boolean_t); +extern void dladm_free_attrs(dladm_arg_list_t *); + +extern dladm_status_t dladm_start_usagelog(dladm_logtype_t, uint_t); +extern dladm_status_t dladm_stop_usagelog(dladm_logtype_t); +extern dladm_status_t dladm_walk_usage_res(int (*)(dladm_usage_t *, void *), + int, char *, char *, char *, 
char *, void *); +extern dladm_status_t dladm_walk_usage_time(int (*)(dladm_usage_t *, void *), + int, char *, char *, char *, void *); +extern dladm_status_t dladm_usage_summary(int (*)(dladm_usage_t *, void *), + int, char *, void *); +extern dladm_status_t dladm_usage_dates(int (*)(dladm_usage_t *, void *), + int, char *, char *, void *); #ifdef __cplusplus } diff --git a/usr/src/lib/libdladm/common/libdladm_impl.h b/usr/src/lib/libdladm/common/libdladm_impl.h index d4a5a52445..41f09b3a46 100644 --- a/usr/src/lib/libdladm/common/libdladm_impl.h +++ b/usr/src/lib/libdladm/common/libdladm_impl.h @@ -36,18 +36,17 @@ extern "C" { #define MAXLINELEN 1024 #define BUFLEN(lim, ptr) (((lim) > (ptr)) ? ((lim) - (ptr)) : 0) -typedef struct val_desc { - char *vd_name; - uintptr_t vd_val; -} val_desc_t; - -#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) - extern dladm_status_t dladm_errno2status(int); extern dladm_status_t i_dladm_rw_db(const char *, mode_t, dladm_status_t (*)(void *, FILE *, FILE *), void *, boolean_t); +extern const char *dladm_pri2str(mac_priority_level_t, char *); +extern dladm_status_t dladm_str2pri(char *, mac_priority_level_t *); +extern dladm_status_t dladm_parse_args(char *, dladm_arg_list_t **, + boolean_t); +extern void dladm_free_args(dladm_arg_list_t *); + /* * Link attributes persisted by dlmgmtd. */ @@ -65,11 +64,64 @@ extern dladm_status_t i_dladm_rw_db(const char *, mode_t, #define FPORTS "portnames" /* string */ #define FPOLICY "policy" /* uint64_t */ #define FFIXMACADDR "fix_macaddr" /* boolean_t */ -#define FMACADDR "macaddr" /* string */ #define FFORCE "force" /* boolean_t */ #define FLACPMODE "lacp_mode" /* uint64_t */ #define FLACPTIMER "lacp_timer" /* uint64_t */ +/* + * Set for VNICs only + */ +#define FMADDRTYPE "maddrtype" /* uint64_t */ +#define FMADDRLEN "maddrlen" /* uint64_t */ +#define FMADDRSLOT "maddrslot" /* uint64_t */ +#define FMADDRPREFIXLEN "maddrpreflen" /* uint64_t */ +#define FHWRINGS "hwrings" /* boolean_t */ + +/* + * Common fields + */ +#define FMACADDR "macaddr" /* string */ + +/* + * Data structures used for implementing temporary properties + */ + +typedef struct val_desc { + char *vd_name; + uintptr_t vd_val; +} val_desc_t; + +#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) + +extern dladm_status_t dladm_link_proplist_extract(dladm_arg_list_t *, + mac_resource_props_t *); + +extern dladm_status_t dladm_flow_proplist_extract(dladm_arg_list_t *, + mac_resource_props_t *); + +/* + * The prop extract() callback. + * + * rp_extract extracts the kernel structure from the val_desc_t created + * by the pd_check function. 
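+ * For example, do_extract_maxbw is expected to copy the bandwidth limit
+ * produced by do_check_maxbw into the mac_resource_props_t passed via
+ * the arg pointer.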
+ */ +typedef dladm_status_t rp_extractf_t(val_desc_t *propval, void *arg, + uint_t cnt); +extern rp_extractf_t do_extract_maxbw, do_extract_priority, + do_extract_cpus; + +typedef struct resource_prop_s { + /* + * resource property name + */ + char *rp_name; + + /* + * callback to extract kernel structure + */ + rp_extractf_t *rp_extract; +} resource_prop_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libdladm/common/libdlaggr.c b/usr/src/lib/libdladm/common/libdlaggr.c index dba84441ea..5a155fcad9 100644 --- a/usr/src/lib/libdladm/common/libdlaggr.c +++ b/usr/src/lib/libdladm/common/libdlaggr.c @@ -37,6 +37,7 @@ #include <libintl.h> #include <net/if_types.h> #include <net/if_dl.h> +#include <sys/dld.h> #include <libdllink.h> #include <libdlvlan.h> #include <libdlaggr.h> @@ -1110,7 +1111,7 @@ dladm_aggr_create(const char *name, uint16_t key, uint32_t nports, for (i = 0; i < nports; i++) { if ((dladm_datalink_id2info(ports[i].lp_linkid, NULL, &class, &media, NULL, 0) != DLADM_STATUS_OK) || - (class != DATALINK_CLASS_PHYS) && (media != DL_ETHER)) { + !((class == DATALINK_CLASS_PHYS) && (media == DL_ETHER))) { return (DLADM_STATUS_BADARG); } } diff --git a/usr/src/lib/libdladm/common/libdlflow.c b/usr/src/lib/libdladm/common/libdlflow.c new file mode 100644 index 0000000000..3ec77705a7 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdlflow.c @@ -0,0 +1,903 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ethernet.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <strings.h>
+#include <libintl.h>
+#include <netdb.h>
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <libdlflow.h>
+#include <libdlflow_impl.h>
+#include <libdladm_impl.h>
+
+/* minimum buffer size for DLDIOCWALKFLOW */
+#define	MIN_INFO_SIZE	(4 * 1024)
+
+#define	DLADM_FLOW_DB		"/etc/dladm/flowadm.conf"
+#define	DLADM_FLOW_DB_TMP	"/etc/dladm/flowadm.conf.new"
+#define	DLADM_FLOW_DB_LOCK	"/tmp/flowadm.conf.lock"
+
+#define	DLADM_FLOW_DB_PERMS	S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
+#define	DLADM_FLOW_DB_OWNER	UID_DLADM
+#define	DLADM_FLOW_DB_GROUP	GID_SYS
+
+#define	BLANK_LINE(s)	((s[0] == '\0') || (s[0] == '#') || (s[0] == '\n'))
+#define	MAXLINELEN	1024
+#define	MAXPATHLEN	1024
+
+#define	V4_PART_OF_V6(v6)	((v6)._S6_un._S6_u32[3])
+
+/* database file parameters */
+static const char *BW_LIMIT = "bw_limit";
+static const char *PRIORITY = "priority";
+static const char *LOCAL_IP_ADDR = "local_ip";
+static const char *REMOTE_IP_ADDR = "remote_ip";
+static const char *TRANSPORT = "transport";
+static const char *LOCAL_PORT = "local_port";
+static const char *DSFIELD = "dsfield";
+
+/*
+ * Open and lock the flowadm configuration file lock.  The lock is
+ * acquired as a reader (F_RDLCK) or writer (F_WRLCK).
+ */
+static int
+i_dladm_flow_lock_db(short type)
+{
+	int	lock_fd;
+	struct	flock lock;
+
+	if ((lock_fd = open(DLADM_FLOW_DB_LOCK, O_RDWR | O_CREAT | O_TRUNC,
+	    DLADM_FLOW_DB_PERMS)) < 0)
+		return (-1);
+
+	lock.l_type = type;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+
+	if (fcntl(lock_fd, F_SETLKW, &lock) < 0) {
+		(void) close(lock_fd);
+		(void) unlink(DLADM_FLOW_DB_LOCK);
+		return (-1);
+	}
+	return (lock_fd);
+}
+
+/*
+ * Unlock and close the specified file.
+ */
+static void
+i_dladm_flow_unlock_db(int fd)
+{
+	struct flock lock;
+
+	if (fd < 0)
+		return;
+
+	lock.l_type = F_UNLCK;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+
+	(void) fcntl(fd, F_SETLKW, &lock);
+	(void) close(fd);
+	(void) unlink(DLADM_FLOW_DB_LOCK);
+}
+
+/*
+ * Parse one line of the flowadm DB.
+ * Returns DLADM_STATUS_OK on success or a dladm_status_t error on failure.
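+ *
+ * Each line has the tab-separated form written by i_dladm_flow_fput_grp()
+ * below:
+ *
+ *	<flowname>	linkid=<id>	[bw_limit=<bps>] [priority=<level>]
+ *	[dsfield=<val:mask>] [local_ip=<addr[/len]>] [remote_ip=<addr[/len]>]
+ *	[transport=<proto>] [local_port=<port>]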
+ */ +dladm_status_t +dladm_flow_parse_db(char *line, dld_flowinfo_t *attr) +{ + char *token; + char *value, *name = NULL; + char *endp = NULL; + char *lasts = NULL; + dladm_status_t status = DLADM_STATUS_FLOW_DB_PARSE_ERR; + + bzero(attr, sizeof (*attr)); + + /* flow name */ + if ((token = strtok_r(line, " \t", &lasts)) == NULL) + goto done; + + if (strlcpy(attr->fi_flowname, token, MAXNAMELEN) >= MAXNAMELEN) + goto done; + + /* resource control and flow descriptor parameters */ + while ((token = strtok_r(NULL, " \t", &lasts)) != NULL) { + if ((name = strdup(token)) == NULL) + goto done; + + (void) strtok(name, "="); + value = strtok(NULL, "="); + if (value == NULL) + goto done; + + if (strcmp(name, "linkid") == 0) { + if ((attr->fi_linkid = + (uint32_t)strtol(value, &endp, 10)) == + DATALINK_INVALID_LINKID) + goto done; + + } else if (strcmp(name, BW_LIMIT) == 0) { + attr->fi_resource_props.mrp_mask |= + MRP_MAXBW; + attr->fi_resource_props.mrp_maxbw = + (uint64_t)strtol(value, &endp, 0); + + } else if (strcmp(name, PRIORITY) == 0) { + attr->fi_resource_props.mrp_mask |= MRP_PRIORITY; + status = dladm_str2pri(value, + &attr->fi_resource_props.mrp_priority); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, DSFIELD) == 0) { + status = do_check_dsfield(value, + &attr->fi_flow_desc); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, LOCAL_IP_ADDR) == 0) { + status = do_check_ip_addr(value, B_TRUE, + &attr->fi_flow_desc); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, REMOTE_IP_ADDR) == 0) { + status = do_check_ip_addr(value, B_FALSE, + &attr->fi_flow_desc); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, TRANSPORT) == 0) { + attr->fi_flow_desc.fd_mask |= FLOW_IP_PROTOCOL; + attr->fi_flow_desc.fd_protocol = + (uint8_t)strtol(value, &endp, 0); + + } else if (strcmp(name, LOCAL_PORT) == 0) { + attr->fi_flow_desc.fd_mask |= FLOW_ULP_PORT_LOCAL; + attr->fi_flow_desc.fd_local_port = + (uint16_t)strtol(value, &endp, 10); + attr->fi_flow_desc.fd_local_port = + htons(attr->fi_flow_desc.fd_local_port); + } + free(name); + name = NULL; + } + if (attr->fi_linkid != DATALINK_INVALID_LINKID) + status = DLADM_STATUS_OK; +done: + free(name); + return (status); +} + +#define FPRINTF_ERR(fcall) if ((fcall) < 0) return (-1); + +/* + * Write the attribute of a group to the specified file. Returns 0 on + * success, -1 on failure. + */ +static int +i_dladm_flow_fput_grp(FILE *fp, dld_flowinfo_t *attr) +{ + + FPRINTF_ERR(fprintf(fp, "%s\tlinkid=%d\t", + attr->fi_flowname, attr->fi_linkid)); + + /* flow policy */ + if (attr->fi_resource_props.mrp_mask & MRP_MAXBW) + FPRINTF_ERR(fprintf(fp, "%s=%" PRIu64 "\t", BW_LIMIT, + attr->fi_resource_props.mrp_maxbw)); + + if (attr->fi_resource_props.mrp_mask & MRP_PRIORITY) + FPRINTF_ERR(fprintf(fp, "%s=%d\t", PRIORITY, + attr->fi_resource_props.mrp_priority)); + + /* flow descriptor */ + if (attr->fi_flow_desc.fd_mask & FLOW_IP_DSFIELD) + FPRINTF_ERR(fprintf(fp, "%s=%x:%x\t", DSFIELD, + attr->fi_flow_desc.fd_dsfield, + attr->fi_flow_desc.fd_dsfield_mask)); + + if (attr->fi_flow_desc.fd_mask & FLOW_IP_LOCAL) { + char abuf[INET6_ADDRSTRLEN], *ap; + struct in_addr ipaddr; + int prefix_len, prefix_max; + + if (attr->fi_flow_desc.fd_ipversion != 6) { + ipaddr.s_addr = + attr->fi_flow_desc. 
+ fd_local_addr._S6_un._S6_u32[3]; + + ap = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } else { + (void) inet_ntop(AF_INET6, + &attr->fi_flow_desc.fd_local_addr, + abuf, INET6_ADDRSTRLEN); + + ap = abuf; + prefix_max = IPV6_ABITS; + } + (void) dladm_mask2prefixlen( + &attr->fi_flow_desc.fd_local_netmask, prefix_max, + &prefix_len); + + FPRINTF_ERR(fprintf(fp, "%s=%s/%d\t", LOCAL_IP_ADDR, + ap, prefix_len)); + } + if (attr->fi_flow_desc.fd_mask & FLOW_IP_REMOTE) { + char abuf[INET6_ADDRSTRLEN], *ap; + struct in_addr ipaddr; + int prefix_len, prefix_max; + + if (attr->fi_flow_desc.fd_ipversion != 6) { + ipaddr.s_addr = + attr->fi_flow_desc. + fd_remote_addr._S6_un._S6_u32[3]; + + ap = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } else { + (void) inet_ntop(AF_INET6, + &(attr->fi_flow_desc.fd_remote_addr), + abuf, INET6_ADDRSTRLEN); + + ap = abuf; + prefix_max = IPV6_ABITS; + } + (void) dladm_mask2prefixlen( + &attr->fi_flow_desc.fd_remote_netmask, prefix_max, + &prefix_len); + + FPRINTF_ERR(fprintf(fp, "%s=%s/%d\t", REMOTE_IP_ADDR, + ap, prefix_len)); + } + if (attr->fi_flow_desc.fd_mask & FLOW_IP_PROTOCOL) + FPRINTF_ERR(fprintf(fp, "%s=%d\t", TRANSPORT, + attr->fi_flow_desc.fd_protocol)); + + if (attr->fi_flow_desc.fd_mask & FLOW_ULP_PORT_LOCAL) + FPRINTF_ERR(fprintf(fp, "%s=%d\t", LOCAL_PORT, + ntohs(attr->fi_flow_desc.fd_local_port))); + + FPRINTF_ERR(fprintf(fp, "\n")); + + return (0); + +} + +static dladm_status_t +i_dladm_flow_walk_rw_db(int (*fn)(void *, dld_flowinfo_t *), + void *arg, + const char *root) +{ + FILE *fp, *nfp; + int nfd, fn_rc, lock_fd; + char line[MAXLINELEN]; + dld_flowinfo_t attr; + char *db_file, *tmp_db_file; + char db_file_buf[MAXPATHLEN]; + char tmp_db_file_buf[MAXPATHLEN]; + dladm_status_t status = DLADM_STATUS_FLOW_DB_ERR; + + if (root == NULL) { + db_file = DLADM_FLOW_DB; + tmp_db_file = DLADM_FLOW_DB_TMP; + } else { + (void) snprintf(db_file_buf, MAXPATHLEN, "%s%s", root, + DLADM_FLOW_DB); + (void) snprintf(tmp_db_file_buf, MAXPATHLEN, "%s%s", root, + DLADM_FLOW_DB_TMP); + db_file = db_file_buf; + tmp_db_file = tmp_db_file_buf; + } + + if ((lock_fd = i_dladm_flow_lock_db(F_WRLCK)) < 0) + return (DLADM_STATUS_FLOW_DB_ERR); + + if ((fp = fopen(db_file, "r")) == NULL) { + i_dladm_flow_unlock_db(lock_fd); + return (DLADM_STATUS_FLOW_DB_OPEN_ERR); + } + + if ((nfd = open(tmp_db_file, O_WRONLY|O_CREAT|O_TRUNC, + DLADM_FLOW_DB_PERMS)) == -1) { + (void) fclose(fp); + i_dladm_flow_unlock_db(lock_fd); + return (DLADM_STATUS_FLOW_DB_OPEN_ERR); + } + + if ((nfp = fdopen(nfd, "w")) == NULL) { + (void) close(nfd); + (void) fclose(fp); + (void) unlink(tmp_db_file); + i_dladm_flow_unlock_db(lock_fd); + return (DLADM_STATUS_FLOW_DB_OPEN_ERR); + } + + while (fgets(line, MAXLINELEN, fp) != NULL) { + + /* skip comments */ + if (BLANK_LINE(line)) { + if (fputs(line, nfp) == EOF) + goto failed; + continue; + } + (void) strtok(line, " \n"); + + if ((status = dladm_flow_parse_db(line, &attr)) != + DLADM_STATUS_OK) + goto failed; + + fn_rc = fn(arg, &attr); + + switch (fn_rc) { + case -1: + /* failure, stop walking */ + goto failed; + case 0: + /* + * Success, write group attributes, which could + * have been modified by fn(). 
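+			 * (A callback return of 1 skips the entry instead,
+			 * which is how deletions are applied to the
+			 * rewritten file.)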
+			 */
+			if (i_dladm_flow_fput_grp(nfp, &attr) != 0)
+				goto failed;
+			break;
+		case 1:
+			/* skip current group */
+			break;
+		}
+	}
+	if (fchmod(nfd, DLADM_FLOW_DB_PERMS) == -1)
+		goto failed;
+
+	if (fchown(nfd, DLADM_FLOW_DB_OWNER, DLADM_FLOW_DB_GROUP) == -1)
+		goto failed;
+
+	if (fflush(nfp) == EOF)
+		goto failed;
+
+	(void) fclose(fp);
+	(void) fclose(nfp);
+
+	if (rename(tmp_db_file, db_file) == -1) {
+		(void) unlink(tmp_db_file);
+		i_dladm_flow_unlock_db(lock_fd);
+		return (DLADM_STATUS_FLOW_DB_ERR);
+	}
+	i_dladm_flow_unlock_db(lock_fd);
+	return (DLADM_STATUS_OK);
+
+failed:
+	(void) fclose(fp);
+	(void) fclose(nfp);
+	(void) unlink(tmp_db_file);
+	i_dladm_flow_unlock_db(lock_fd);
+
+	return (status);
+}
+
+/*
+ * Remove existing flow from DB.
+ */
+
+typedef struct remove_db_state {
+	dld_flowinfo_t	rs_newattr;
+	dld_flowinfo_t	rs_oldattr;
+	boolean_t	rs_found;
+} remove_db_state_t;
+
+static int
+i_dladm_flow_remove_db_fn(void *arg, dld_flowinfo_t *grp)
+{
+	remove_db_state_t	*state = (remove_db_state_t *)arg;
+	dld_flowinfo_t		*attr = &state->rs_newattr;
+
+	if ((strcmp(grp->fi_flowname, attr->fi_flowname)) != 0)
+		return (0);
+	else {
+		bcopy(grp, &state->rs_oldattr,
+		    sizeof (dld_flowinfo_t));
+		state->rs_found = B_TRUE;
+		return (1);
+	}
+}
+
+/* ARGSUSED */
+static int
+i_dladm_flow_remove_db(remove_db_state_t *state, const char *root)
+{
+	if (i_dladm_flow_walk_rw_db(i_dladm_flow_remove_db_fn, state, root)
+	    != 0)
+		return (-1);
+
+	if (!state->rs_found) {
+		errno = ENOENT;
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Create a flow in the DB.
+ */
+
+typedef struct modify_db_state {
+	dld_flowinfo_t	ms_newattr;
+	dld_flowinfo_t	ms_oldattr;
+	boolean_t	ms_found;
+} modify_db_state_t;
+
+static dladm_status_t
+i_dladm_flow_create_db(dld_flowinfo_t *attr, const char *root)
+{
+	FILE	*fp;
+	char	line[MAXLINELEN];
+	char	*db_file;
+	char	db_file_buf[MAXPATHLEN];
+	int	lock_fd;
+	dladm_status_t	status = DLADM_STATUS_OK;
+
+	if (root == NULL) {
+		db_file = DLADM_FLOW_DB;
+	} else {
+		(void) snprintf(db_file_buf, MAXPATHLEN, "%s%s", root,
+		    DLADM_FLOW_DB);
+		db_file = db_file_buf;
+	}
+
+	if ((lock_fd = i_dladm_flow_lock_db(F_WRLCK)) < 0)
+		return (DLADM_STATUS_FLOW_DB_ERR);
+
+	if ((fp = fopen(db_file, "r+")) == NULL &&
+	    (fp = fopen(db_file, "w")) == NULL) {
+		i_dladm_flow_unlock_db(lock_fd);
+		return (DLADM_STATUS_FLOW_DB_OPEN_ERR);
+	}
+
+	/* look for an existing group with the same flowname */
+	while (fgets(line, MAXLINELEN, fp) != NULL) {
+		char	*holder, *lasts;
+
+		/* skip comments */
+		if (BLANK_LINE(line))
+			continue;
+
+		/* ignore corrupted lines */
+		holder = strtok_r(line, " \t", &lasts);
+		if (holder == NULL)
+			continue;
+
+		/* flow id */
+		if (strcmp(holder, attr->fi_flowname) == 0) {
+			/* a group with this flow id already exists */
+			status = DLADM_STATUS_PERSIST_FLOW_EXISTS;
+			goto failed;
+		}
+	}
+	/*
+	 * If we get here, we've verified that no existing group with
+	 * the same flow id already exists.  It is now time to add the
+	 * new group to the DB.
+ */ + if (i_dladm_flow_fput_grp(fp, attr) != 0) + status = DLADM_STATUS_FLOW_DB_PARSE_ERR; + +failed: + (void) fclose(fp); + i_dladm_flow_unlock_db(lock_fd); + return (status); +} + +static dladm_status_t +i_dladm_flow_add(char *flowname, datalink_id_t linkid, flow_desc_t *flowdesc, + mac_resource_props_t *mrp) +{ + dld_ioc_addflow_t attr; + int fd; + + /* create flow */ + bzero(&attr, sizeof (attr)); + bcopy(flowdesc, &attr.af_flow_desc, sizeof (flow_desc_t)); + if (mrp != NULL) { + bcopy(mrp, &attr.af_resource_props, + sizeof (mac_resource_props_t)); + } + + (void) strlcpy(attr.af_name, flowname, sizeof (attr.af_name)); + attr.af_linkid = linkid; + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) + return (dladm_errno2status(errno)); + + if (ioctl(fd, DLDIOC_ADDFLOW, &attr) < 0) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + + (void) close(fd); + + return (DLADM_STATUS_OK); +} + +static dladm_status_t +i_dladm_flow_remove(char *flowname) +{ + dld_ioc_removeflow_t attr; + int fd; + dladm_status_t status = DLADM_STATUS_OK; + + (void) strlcpy(attr.rf_name, flowname, + sizeof (attr.rf_name)); + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) + return (dladm_errno2status(errno)); + + if (ioctl(fd, DLDIOC_REMOVEFLOW, &attr) < 0) + status = dladm_errno2status(errno); + + (void) close(fd); + + return (status); +} + + +/* ARGSUSED */ +dladm_status_t +dladm_flow_add(datalink_id_t linkid, dladm_arg_list_t *attrlist, + dladm_arg_list_t *proplist, char *flowname, boolean_t tempop, + const char *root) +{ + dld_flowinfo_t db_attr; + flow_desc_t flowdesc; + mac_resource_props_t mrp; + dladm_status_t status; + + /* Extract flow attributes from attrlist */ + bzero(&flowdesc, sizeof (flow_desc_t)); + if (attrlist != NULL && (status = dladm_flow_attrlist_extract(attrlist, + &flowdesc)) != DLADM_STATUS_OK) { + return (status); + } + + /* Extract resource_ctl and cpu_list from proplist */ + bzero(&mrp, sizeof (mac_resource_props_t)); + if (proplist != NULL && (status = dladm_flow_proplist_extract(proplist, + &mrp)) != DLADM_STATUS_OK) { + return (status); + } + + /* Add flow in kernel */ + status = i_dladm_flow_add(flowname, linkid, &flowdesc, &mrp); + if (status != DLADM_STATUS_OK) + return (status); + + /* Add flow to DB */ + if (!tempop) { + bzero(&db_attr, sizeof (db_attr)); + bcopy(&flowdesc, &db_attr.fi_flow_desc, sizeof (flow_desc_t)); + (void) strlcpy(db_attr.fi_flowname, flowname, + sizeof (db_attr.fi_flowname)); + db_attr.fi_linkid = linkid; + + if ((status = i_dladm_flow_create_db(&db_attr, root)) != + DLADM_STATUS_OK) { + (void) i_dladm_flow_remove(flowname); + return (status); + } + /* set flow properties */ + if (proplist != NULL) { + status = i_dladm_set_flow_proplist_db(flowname, + proplist); + if (status != DLADM_STATUS_OK) { + (void) i_dladm_flow_remove(flowname); + return (status); + } + } + } + return (status); +} + +/* + * Remove a flow. 
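+ * For example, dladm_flow_remove("flow1", B_FALSE, NULL) removes both
+ * the active flow and its persistent configuration from the default
+ * DB; with tempop set to B_TRUE only the active flow is removed.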
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_flow_remove(char *flowname, boolean_t tempop,
+    const char *root)
+{
+	remove_db_state_t	state;
+	dladm_status_t		status = DLADM_STATUS_OK;
+	dladm_status_t		s = DLADM_STATUS_OK;
+
+	/* remove flow */
+	status = i_dladm_flow_remove(flowname);
+	if ((status != DLADM_STATUS_OK) &&
+	    (tempop || status != DLADM_STATUS_NOTFOUND))
+		goto done;
+
+	/* remove flow from DB */
+	if (!tempop) {
+		bzero(&state, sizeof (state));
+		(void) strlcpy(state.rs_newattr.fi_flowname, flowname,
+		    sizeof (state.rs_newattr.fi_flowname));
+		state.rs_found = B_FALSE;
+
+		/* flow DB */
+		if (i_dladm_flow_remove_db(&state, root) < 0) {
+			s = dladm_errno2status(errno);
+			goto done;
+		}
+
+		/* flow prop DB */
+		s = dladm_set_flowprop(flowname, NULL, NULL, 0,
+		    DLADM_OPT_PERSIST, NULL);
+	}
+
+done:
+	if (!tempop) {
+		if (s == DLADM_STATUS_OK) {
+			if (status == DLADM_STATUS_NOTFOUND)
+				status = s;
+		} else {
+			if (s != DLADM_STATUS_NOTFOUND)
+				status = s;
+		}
+	}
+	return (status);
+}
+
+/*
+ * Get an existing flow in the DB.
+ */
+
+typedef struct get_db_state {
+	int		(*gs_fn)(dladm_flow_attr_t *, void *);
+	void		*gs_arg;
+	datalink_id_t	gs_linkid;
+} get_db_state_t;
+
+/*
+ * For each flow which matches the linkid, copy all flow information
+ * to a new dladm_flow_attr_t structure and call the provided
+ * function. This is used to display persistent flows from
+ * the database.
+ */
+
+static int
+i_dladm_flow_get_db_fn(void *arg, dld_flowinfo_t *grp)
+{
+	get_db_state_t		*state = (get_db_state_t *)arg;
+	dladm_flow_attr_t	attr;
+
+	if (grp->fi_linkid == state->gs_linkid) {
+		attr.fa_linkid = state->gs_linkid;
+		bcopy(grp->fi_flowname, &attr.fa_flowname,
+		    sizeof (attr.fa_flowname));
+		bcopy(&grp->fi_flow_desc, &attr.fa_flow_desc,
+		    sizeof (attr.fa_flow_desc));
+		bcopy(&grp->fi_resource_props, &attr.fa_resource_props,
+		    sizeof (attr.fa_resource_props));
+		(void) state->gs_fn(&attr, state->gs_arg);
+	}
+	return (0);
+}
+
+/*
+ * Walk through the flows defined on the system and for each flow
+ * invoke <fn>(<arg>, <flow>);
+ * Currently used for show-flow.
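+ * A caller sketch (hypothetical callback name):
+ *
+ *	static int
+ *	show_one(dladm_flow_attr_t *attr, void *arg)
+ *	{
+ *		(void) printf("%s\n", attr->fa_flowname);
+ *		return (DLADM_WALK_CONTINUE);
+ *	}
+ *
+ *	(void) dladm_walk_flow(show_one, linkid, NULL, B_FALSE);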
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_walk_flow(int (*fn)(dladm_flow_attr_t *, void *),
+    datalink_id_t linkid, void *arg, boolean_t persist)
+{
+	dld_flowinfo_t		*flow;
+	int			i, bufsize, fd;
+	dld_ioc_walkflow_t	*ioc = NULL;
+	dladm_flow_attr_t	attr;
+	dladm_status_t		status = DLADM_STATUS_OK;
+
+	if (fn == NULL)
+		return (DLADM_STATUS_BADARG);
+
+	if (persist) {
+		get_db_state_t state;
+
+		bzero(&state, sizeof (state));
+
+		state.gs_linkid = linkid;
+		state.gs_fn = fn;
+		state.gs_arg = arg;
+		status = i_dladm_flow_walk_rw_db(i_dladm_flow_get_db_fn,
+		    &state, NULL);
+		if (status != DLADM_STATUS_OK)
+			return (status);
+		/*
+		 * Return here: the ioctl path below (and its fd) is
+		 * not used for a persistent walk.
+		 */
+		return (DLADM_STATUS_OK);
+	} else {
+		if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
+			return (dladm_errno2status(errno));
+
+		bufsize = MIN_INFO_SIZE;
+		if ((ioc = calloc(1, bufsize)) == NULL) {
+			status = dladm_errno2status(errno);
+			(void) close(fd);
+			return (status);
+		}
+
+		ioc->wf_linkid = linkid;
+		ioc->wf_len = bufsize - sizeof (*ioc);
+
+		while (ioctl(fd, DLDIOC_WALKFLOW, ioc) < 0) {
+			if (errno == ENOSPC) {
+				bufsize *= 2;
+				ioc = realloc(ioc, bufsize);
+				if (ioc != NULL) {
+					ioc->wf_linkid = linkid;
+					ioc->wf_len = bufsize - sizeof (*ioc);
+					continue;
+				}
+			}
+			goto bail;
+		}
+
+		flow = (dld_flowinfo_t *)(void *)(ioc + 1);
+		for (i = 0; i < ioc->wf_nflows; i++, flow++) {
+			bzero(&attr, sizeof (attr));
+
+			attr.fa_linkid = flow->fi_linkid;
+			bcopy(&flow->fi_flowname, &attr.fa_flowname,
+			    sizeof (attr.fa_flowname));
+			bcopy(&flow->fi_flow_desc, &attr.fa_flow_desc,
+			    sizeof (attr.fa_flow_desc));
+			bcopy(&flow->fi_resource_props, &attr.fa_resource_props,
+			    sizeof (attr.fa_resource_props));
+
+			if (fn(&attr, arg) == DLADM_WALK_TERMINATE)
+				break;
+		}
+	}
+
+bail:
+	free(ioc);
+	(void) close(fd);
+	return (status);
+}
+
+dladm_status_t
+dladm_flow_init(void)
+{
+	flow_desc_t		flowdesc;
+	datalink_id_t		linkid;
+	dladm_status_t		s, status = DLADM_STATUS_OK;
+	char			name[MAXNAMELEN];
+	char			line[MAXLINELEN];
+	dld_flowinfo_t		attr;
+	FILE			*fp;
+
+	if ((fp = fopen(DLADM_FLOW_DB, "r")) == NULL)
+		return (DLADM_STATUS_DB_NOTFOUND);
+
+	while (fgets(line, MAXLINELEN, fp) != NULL) {
+		/* skip comments */
+		if (BLANK_LINE(line))
+			continue;
+
+		(void) strtok(line, " \n");
+
+		s = dladm_flow_parse_db(line, &attr);
+		if (s != DLADM_STATUS_OK) {
+			status = s;
+			continue;
+		}
+		bzero(&flowdesc, sizeof (flowdesc));
+		bcopy(&attr.fi_flow_desc, &flowdesc, sizeof (flow_desc_t));
+		(void) strlcpy(name, attr.fi_flowname,
+		    sizeof (name));
+		linkid = attr.fi_linkid;
+
+		s = i_dladm_flow_add(name, linkid, &flowdesc, NULL);
+		if (s != DLADM_STATUS_OK)
+			status = s;
+	}
+	s = i_dladm_init_flowprop_db();
+	if (s != DLADM_STATUS_OK)
+		status = s;
+
+	(void) fclose(fp);
+	return (status);
+}
+
+dladm_status_t
+dladm_prefixlen2mask(int prefixlen, int maxlen, uchar_t *mask)
+{
+	if (prefixlen < 0 || prefixlen > maxlen)
+		return (DLADM_STATUS_BADARG);
+
+	while (prefixlen > 0) {
+		if (prefixlen >= 8) {
+			*mask++ = 0xFF;
+			prefixlen -= 8;
+			continue;
+		}
+		*mask |= 1 << (8 - prefixlen);
+		prefixlen--;
+	}
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_mask2prefixlen(in6_addr_t *mask, int plen, int *prefixlen)
+{
+	int	bits;
+	int	i, end;
+
+	switch (plen) {
+	case IP_ABITS:
+		end = 3;
+		break;
+	case IPV6_ABITS:
+		end = 0;
+		break;
+	default:
+		return (DLADM_STATUS_BADARG);
+	}
+
+	for (i = 3; i >= end; i--) {
+		if (mask->_S6_un._S6_u32[i] == 0) {
+			plen -= 32;
+			continue;
+		}
+		bits = ffs(ntohl(mask->_S6_un._S6_u32[i])) - 1;
+		if (bits == 0)
+			break;
+		plen -= bits;
+	}
+	*prefixlen = plen;
+	return (DLADM_STATUS_OK);
+}
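A round-trip sketch for the two prefix helpers above (assumed usage: the
caller must zero the mask buffer first, since dladm_prefixlen2mask() only
ORs bits into place, and for IPv4 the address occupies _S6_u32[3], matching
the convention used earlier in this file):

	in6_addr_t mask;
	int plen;

	bzero(&mask, sizeof (mask));
	(void) dladm_prefixlen2mask(24, IP_ABITS,
	    (uchar_t *)&mask._S6_un._S6_u32[3]);
	(void) dladm_mask2prefixlen(&mask, IP_ABITS, &plen);
	/* plen is now 24 */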
diff --git a/usr/src/lib/libdladm/common/libdlflow.h b/usr/src/lib/libdladm/common/libdlflow.h
new file mode 100644
index 0000000000..d35631ba4b
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlflow.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBDLFLOW_H
+#define	_LIBDLFLOW_H
+
+/*
+ * This file includes structures, macros and routines used by general
+ * flow administration.
+ */
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <sys/mac_flow.h>
+#include <sys/dld.h>
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <libdladm.h>
+#include <libdladm_impl.h>
+
+#ifdef	__cplusplus
extern "C" {
+#endif
+
+typedef struct dladm_flow_attr {
+	datalink_id_t		fa_linkid;
+	char			fa_flowname[MAXNAMELEN];
+	flow_desc_t		fa_flow_desc;
+	mac_resource_props_t	fa_resource_props;
+	uint64_t		fa_mask;
+	int			fa_nattr;
+} dladm_flow_attr_t;
+
+extern dladm_status_t	dladm_flow_add(datalink_id_t, dladm_arg_list_t *,
+			    dladm_arg_list_t *, char *, boolean_t,
+			    const char *);
+extern dladm_status_t	dladm_flow_remove(char *, boolean_t, const char *);
+extern dladm_status_t	dladm_flow_init(void);
+
+extern dladm_status_t	dladm_flow_parse_db(char *, dld_flowinfo_t *);
+extern dladm_status_t	dladm_walk_flow(int (*)(dladm_flow_attr_t *,
+			    void *), datalink_id_t, void *, boolean_t);
+extern dladm_status_t	dladm_flow_info(const char *, dladm_flow_attr_t *);
+
+extern dladm_status_t	dladm_set_flowprop(const char *, const char *,
+			    char **, uint_t, uint_t, char **);
+extern dladm_status_t	dladm_get_flowprop(const char *, uint32_t,
+			    const char *, char **, uint_t *);
+extern dladm_status_t	dladm_walk_flowprop(int (*)(void *, const char *),
+			    const char *, void *);
+
+extern void		dladm_flow_attr_mask(uint64_t, dladm_flow_attr_t *);
+extern dladm_status_t	dladm_flow_attr_check(dladm_arg_list_t *);
+extern dladm_status_t	dladm_prefixlen2mask(int, int, uchar_t *);
+extern dladm_status_t	dladm_mask2prefixlen(in6_addr_t *, int, int *);
+extern char		*dladm_proto2str(uint8_t);
+extern uint8_t		dladm_str2proto(const char *);
+
+extern void		dladm_flow_attr_ip2str(dladm_flow_attr_t *,
+			    char *, size_t);
+extern void		dladm_flow_attr_proto2str(dladm_flow_attr_t *,
+			    char *, size_t);
+extern void		dladm_flow_attr_port2str(dladm_flow_attr_t *,
+			    char *, size_t);
+extern void		dladm_flow_attr_dsfield2str(dladm_flow_attr_t *,
+			    char *, size_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBDLFLOW_H */
diff --git a/usr/src/lib/libdladm/common/libdlflow_impl.h b/usr/src/lib/libdladm/common/libdlflow_impl.h
new file mode 100644
index 0000000000..09b6d55bc1
--- /dev/null
+++ 
b/usr/src/lib/libdladm/common/libdlflow_impl.h @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBDLFLOW_IMPL_H +#define _LIBDLFLOW_IMPL_H + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/mac.h> +#include <libdladm.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct fprop_desc; +struct fattr_desc; + +typedef dladm_status_t fpd_getf_t(const char *, char **, uint_t *); +typedef dladm_status_t fpd_setf_t(const char *, val_desc_t *, uint_t); +typedef dladm_status_t fpd_checkf_t(struct fprop_desc *, char **, + uint_t, val_desc_t **); + +typedef struct fprop_desc { + char *pd_name; + val_desc_t pd_defval; + val_desc_t *pd_modval; + uint_t pd_nmodval; + boolean_t pd_temponly; + fpd_setf_t *pd_set; + fpd_getf_t *pd_getmod; + fpd_getf_t *pd_get; + fpd_checkf_t *pd_check; +} fprop_desc_t; + +typedef struct prop_table { + fprop_desc_t *pt_table; + uint_t pt_size; +} prop_table_t; + +typedef enum { + DLADM_PROP_VAL_CURRENT = 1, + DLADM_PROP_VAL_DEFAULT, + DLADM_PROP_VAL_MODIFIABLE, + DLADM_PROP_VAL_PERSISTENT +} prop_type_t; + +typedef dladm_status_t fad_checkf_t(char *, flow_desc_t *); + +extern dladm_status_t do_check_ip_addr(char *, boolean_t, flow_desc_t *); +extern dladm_status_t do_check_dsfield(char *, flow_desc_t *); + +typedef struct fattr_desc { + const char *ad_name; + fad_checkf_t *ad_check; +} fattr_desc_t; + +extern dladm_status_t i_dladm_get_prop_temp(const char *, prop_type_t, + const char *, char **, uint_t *, prop_table_t *); +extern dladm_status_t i_dladm_set_prop_temp(const char *, const char *, + char **, uint_t, uint_t, char **, prop_table_t *); +extern boolean_t i_dladm_is_prop_temponly(const char *prop_name, + char **, prop_table_t *); +/* + * Data structures used for implementing persistent properties + */ +typedef struct prop_val { + const char *lv_name; + struct prop_val *lv_nextval; +} prop_val_t; + +typedef struct prop_db_info { + const char *li_name; + struct prop_db_info *li_nextprop; + struct prop_val *li_val; +} prop_db_info_t; + +typedef struct prop_db_state prop_db_state_t; + +typedef boolean_t (*prop_db_op_t)(prop_db_state_t *, + char *, prop_db_info_t *, dladm_status_t *); + +typedef dladm_status_t (*prop_db_initop_t)(const char *, const char *, + char **, uint_t, uint_t, char **); + +struct prop_db_state { + prop_db_op_t ls_op; + const char *ls_name; + const char *ls_propname; + char **ls_propval; + uint_t *ls_valcntp; + prop_db_initop_t ls_initop; +}; + +extern boolean_t process_prop_set(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp); +extern boolean_t process_prop_get(prop_db_state_t *lsp, 
char *buf, + prop_db_info_t *listp, dladm_status_t *statusp); +extern boolean_t process_prop_init(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp); +extern dladm_status_t process_prop_db(void *arg, FILE *fp, FILE *nfp); + +extern dladm_status_t i_dladm_init_flowprop_db(void); +extern dladm_status_t i_dladm_set_flow_proplist_db(char *, + dladm_arg_list_t *); +extern dladm_status_t i_dladm_flow_check_restriction(datalink_id_t, + flow_desc_t *, mac_resource_props_t *, boolean_t); + +extern dladm_status_t dladm_flow_attrlist_extract(dladm_arg_list_t *, + flow_desc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBDLFLOW_IMPL_H */ diff --git a/usr/src/lib/libdladm/common/libdllink.c b/usr/src/lib/libdladm/common/libdllink.c index 8deed6fe76..5698409442 100644 --- a/usr/src/lib/libdladm/common/libdllink.c +++ b/usr/src/lib/libdladm/common/libdllink.c @@ -62,6 +62,50 @@ i_dladm_info(int fd, const datalink_id_t linkid, dladm_attr_t *dap) return (DLADM_STATUS_OK); } +static dladm_status_t +dladm_usagelog(dladm_logtype_t type, dld_ioc_usagelog_t *log_info) +{ + int fd; + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) + return (DLADM_STATUS_IOERR); + + if (type == DLADM_LOGTYPE_FLOW) + log_info->ul_type = MAC_LOGTYPE_FLOW; + else + log_info->ul_type = MAC_LOGTYPE_LINK; + + if (ioctl(fd, DLDIOC_USAGELOG, log_info) < 0) { + (void) close(fd); + return (DLADM_STATUS_IOERR); + } + (void) close(fd); + return (DLADM_STATUS_OK); +} + +dladm_status_t +dladm_start_usagelog(dladm_logtype_t type, uint_t interval) +{ + dld_ioc_usagelog_t log_info; + + log_info.ul_onoff = B_TRUE; + log_info.ul_interval = interval; + + return (dladm_usagelog(type, &log_info)); +} + +dladm_status_t +dladm_stop_usagelog(dladm_logtype_t type) +{ + dld_ioc_usagelog_t log_info; + + log_info.ul_onoff = B_FALSE; + log_info.ul_interval = 0; + + return (dladm_usagelog(type, &log_info)); +} + struct i_dladm_walk_arg { dladm_walkcb_t *fn; void *arg; @@ -96,6 +140,112 @@ dladm_walk(dladm_walkcb_t *fn, void *arg, datalink_class_t class, class, dmedia, flags)); } +#define MAXGRPPERLINK 64 + +int +dladm_walk_hwgrp(datalink_id_t linkid, void *arg, + boolean_t (*fn)(void *, dladm_hwgrp_attr_t *)) +{ + int fd, bufsize, ret; + int nhwgrp = MAXGRPPERLINK; + dld_ioc_hwgrpget_t *iomp = NULL; + + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) + return (-1); + + bufsize = sizeof (dld_ioc_hwgrpget_t) + + nhwgrp * sizeof (dld_hwgrpinfo_t); + + if ((iomp = (dld_ioc_hwgrpget_t *)calloc(1, bufsize)) == NULL) + return (-1); + + iomp->dih_size = nhwgrp * sizeof (dld_hwgrpinfo_t); + iomp->dih_linkid = linkid; + + ret = ioctl(fd, DLDIOC_GETHWGRP, iomp); + if (ret == 0) { + int i; + dld_hwgrpinfo_t *dhip; + dladm_hwgrp_attr_t attr; + + dhip = (dld_hwgrpinfo_t *)(iomp + 1); + for (i = 0; i < iomp->dih_n_groups; i++) { + bzero(&attr, sizeof (attr)); + + (void) strlcpy(attr.hg_link_name, + dhip->dhi_link_name, sizeof (attr.hg_link_name)); + attr.hg_grp_num = dhip->dhi_grp_num; + attr.hg_grp_type = dhip->dhi_grp_type; + attr.hg_n_rings = dhip->dhi_n_rings; + attr.hg_n_clnts = dhip->dhi_n_clnts; + (void) strlcpy(attr.hg_client_names, + dhip->dhi_clnts, sizeof (attr.hg_client_names)); + + if (!(*fn)(arg, &attr)) + break; + dhip++; + } + } + free(iomp); + (void) close(fd); + return (ret); +} + +/* + * Invoke the specified callback for each MAC address entry defined on + * the specified device. 
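+ * A callback sketch (hypothetical) that counts the in-use slots;
+ * returning B_FALSE from the callback terminates the walk early:
+ *
+ *	static boolean_t
+ *	count_used(void *arg, dladm_macaddr_attr_t *attr)
+ *	{
+ *		if (attr->ma_flags & DLADM_MACADDR_USED)
+ *			(*(uint_t *)arg)++;
+ *		return (B_TRUE);
+ *	}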
+ */ +int +dladm_walk_macaddr(datalink_id_t linkid, void *arg, + boolean_t (*fn)(void *, dladm_macaddr_attr_t *)) +{ + int fd, bufsize, ret; + int nmacaddr = 1024; + dld_ioc_macaddrget_t *iomp = NULL; + + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) + return (-1); + + bufsize = sizeof (dld_ioc_macaddrget_t) + + nmacaddr * sizeof (dld_macaddrinfo_t); + + if ((iomp = (dld_ioc_macaddrget_t *)calloc(1, bufsize)) == NULL) + return (-1); + + iomp->dig_size = nmacaddr * sizeof (dld_macaddrinfo_t); + iomp->dig_linkid = linkid; + + ret = ioctl(fd, DLDIOC_MACADDRGET, iomp); + if (ret == 0) { + int i; + dld_macaddrinfo_t *dmip; + dladm_macaddr_attr_t attr; + + dmip = (dld_macaddrinfo_t *)(iomp + 1); + for (i = 0; i < iomp->dig_count; i++) { + bzero(&attr, sizeof (attr)); + + attr.ma_slot = dmip->dmi_slot; + attr.ma_flags = 0; + if (dmip->dmi_flags & DLDIOCMACADDR_USED) + attr.ma_flags |= DLADM_MACADDR_USED; + bcopy(dmip->dmi_addr, attr.ma_addr, + dmip->dmi_addrlen); + attr.ma_addrlen = dmip->dmi_addrlen; + (void) strlcpy(attr.ma_client_name, + dmip->dmi_client_name, MAXNAMELEN); + attr.ma_client_linkid = dmip->dma_client_linkid; + + if (!(*fn)(arg, &attr)) + break; + dmip++; + } + } + free(iomp); + (void) close(fd); + return (ret); +} + /* * These routines are used by administration tools such as dladm(1M) to * iterate through the list of MAC interfaces @@ -253,84 +403,22 @@ dladm_linkduplex2str(link_duplex_t duplex, char *buf) /* * Set zoneid of a given link. Note that this function takes a link name * argument instead of a linkid, because a data-link (and its linkid) could - * be created implicitly as the result of this function. For example, a VLAN - * could be created if a VLAN PPA hack name is assigned to an exclusive - * non-global zone. + * be created implicitly as the result of this function. */ dladm_status_t dladm_setzid(const char *dlname, char *zone_name) { datalink_id_t linkid; - char *val; - char **prop_val; - char link[MAXLINKNAMELEN]; - uint_t ppa; - char dev[DLPI_LINKNAME_MAX]; - int valsize; dladm_status_t status = DLADM_STATUS_OK; - char *prop_name = "zone"; - boolean_t needfree = B_FALSE; - char delim = ':'; /* If the link does not exist, it is a ppa-hacked vlan. */ status = dladm_name2info(dlname, &linkid, NULL, NULL, NULL); - switch (status) { - case DLADM_STATUS_NOTFOUND: - if (strlen(dlname) > MAXLINKNAMELEN) - return (DLADM_STATUS_BADVAL); - - if (strlen(zone_name) > ZONENAME_MAX) - return (DLADM_STATUS_BADVAL); - - status = dladm_parselink(dlname, dev, &ppa); - if (status != DLADM_STATUS_OK) - return (status); - - ppa = (uint_t)DLS_PPA2INST(ppa); - (void) snprintf(link, sizeof (link), "%s%d", dev, ppa); - - status = dladm_name2info(link, &linkid, NULL, NULL, NULL); - if (status != DLADM_STATUS_OK) - return (status); - - /* - * Since the link does not exist as yet, we've to pass the - * link name too as part of data, so that the kernel can - * create the link. Hence, we're packing the zone_name and - * the link name into val. - */ - valsize = ZONENAME_MAX + MAXLINKNAMELEN + 1; - val = malloc(valsize); - if (val == NULL) - return (DLADM_STATUS_NOMEM); - needfree = B_TRUE; - - (void) snprintf(val, valsize, "%s%c%s", zone_name, - delim, dlname); - - break; - case DLADM_STATUS_OK: - /* - * The link exists, so only the zone_name is being passed as - * val. 
We could also pass zone_name + linkname like in the - * previous case just to maintain consistency, but other calls - * like set_linkprop() in dladm.c [which is called when we run - * 'dladm set-linkprop -p zone <linkname>' at the command line] - * pass in the value entered at the command line [which is zone - * name] as val. - */ - val = zone_name; - break; - default: - return (DLADM_STATUS_FAILED); - } + if (status != DLADM_STATUS_OK) + return (status); - prop_val = &val; - status = dladm_set_linkprop(linkid, prop_name, prop_val, 1, + status = dladm_set_linkprop(linkid, "zone", &zone_name, 1, DLADM_OPT_ACTIVE); - if (needfree) - free(val); return (status); } @@ -958,86 +1046,6 @@ done: } dladm_status_t -dladm_get_single_mac_stat(datalink_id_t linkid, const char *name, uint8_t type, - void *val) -{ - char module[DLPI_LINKNAME_MAX]; - uint_t instance; - char link[DLPI_LINKNAME_MAX]; - dladm_status_t status; - uint32_t flags, media; - kstat_ctl_t *kcp; - kstat_t *ksp; - dladm_phys_attr_t dpap; - - if ((status = dladm_datalink_id2info(linkid, &flags, NULL, &media, - link, DLPI_LINKNAME_MAX)) != DLADM_STATUS_OK) - return (status); - - if (media != DL_ETHER) - return (DLADM_STATUS_LINKINVAL); - - status = dladm_phys_info(linkid, &dpap, DLADM_OPT_PERSIST); - - if (status != DLADM_STATUS_OK) - return (status); - - status = dladm_parselink(dpap.dp_dev, module, &instance); - - if (status != DLADM_STATUS_OK) - return (status); - - if ((kcp = kstat_open()) == NULL) - return (dladm_errno2status(errno)); - - /* - * The kstat query could fail if the underlying MAC - * driver was already detached. - */ - if ((ksp = kstat_lookup(kcp, module, instance, "mac")) == NULL && - (ksp = kstat_lookup(kcp, module, instance, NULL)) == NULL) - goto bail; - - if (kstat_read(kcp, ksp, NULL) == -1) - goto bail; - - if (dladm_kstat_value(ksp, name, type, val) < 0) - goto bail; - - (void) kstat_close(kcp); - return (DLADM_STATUS_OK); -bail: - (void) kstat_close(kcp); - return (dladm_errno2status(errno)); - -} - -int -dladm_kstat_value(kstat_t *ksp, const char *name, uint8_t type, void *buf) -{ - kstat_named_t *knp; - - if ((knp = kstat_data_lookup(ksp, (char *)name)) == NULL) - return (-1); - - if (knp->data_type != type) - return (-1); - - switch (type) { - case KSTAT_DATA_UINT64: - *(uint64_t *)buf = knp->value.ui64; - break; - case KSTAT_DATA_UINT32: - *(uint32_t *)buf = knp->value.ui32; - break; - default: - return (-1); - } - - return (0); -} - -dladm_status_t dladm_parselink(const char *dev, char *provider, uint_t *ppa) { ifspec_t ifsp; diff --git a/usr/src/lib/libdladm/common/libdllink.h b/usr/src/lib/libdladm/common/libdllink.h index ea51087a83..29d078470c 100644 --- a/usr/src/lib/libdladm/common/libdllink.h +++ b/usr/src/lib/libdladm/common/libdllink.h @@ -31,17 +31,19 @@ * link administration (i.e. not limited to one specific type of link). 
*/ +#include <stdio.h> #include <sys/types.h> #include <sys/param.h> #include <libdladm.h> -#include <kstat.h> +#include <libdladm_impl.h> +#include <sys/mac_flow.h> #ifdef __cplusplus extern "C" { #endif typedef struct dladm_attr { - uint_t da_max_sdu; + uint_t da_max_sdu; } dladm_attr_t; typedef struct dladm_phys_attr { @@ -86,6 +88,32 @@ typedef int dladm_secobj_class_t; typedef int (dladm_walkcb_t)(const char *, void *); +/* possible flags for ma_flags below */ +#define DLADM_MACADDR_USED 0x1 + +typedef enum { + DLADM_HWGRP_TYPE_RX = 0x1, + DLADM_HWGRP_TYPE_TX +} dladm_hwgrp_type_t; + +typedef struct dladm_hwgrp_attr { + char hg_link_name[MAXLINKNAMELEN]; + uint_t hg_grp_num; + dladm_hwgrp_type_t hg_grp_type; + uint_t hg_n_rings; + uint_t hg_n_clnts; + char hg_client_names[MAXCLIENTNAMELEN]; +} dladm_hwgrp_attr_t; + +typedef struct dladm_macaddr_attr { + uint_t ma_slot; + uint_t ma_flags; + uchar_t ma_addr[MAXMACADDRLEN]; + uint_t ma_addrlen; + char ma_client_name[MAXNAMELEN]; + datalink_id_t ma_client_linkid; +} dladm_macaddr_attr_t; + extern dladm_status_t dladm_walk(dladm_walkcb_t *, void *, datalink_class_t, datalink_media_t, uint32_t); extern dladm_status_t dladm_mac_walk(dladm_walkcb_t *, void *); @@ -148,12 +176,19 @@ extern dladm_status_t dladm_phys_delete(datalink_id_t); extern dladm_status_t dladm_phys_info(datalink_id_t, dladm_phys_attr_t *, uint32_t); -extern dladm_status_t dladm_get_single_mac_stat(datalink_id_t, const char *, - uint8_t, void *); -extern int dladm_kstat_value(kstat_t *, const char *, uint8_t, - void *); extern dladm_status_t dladm_parselink(const char *, char *, uint_t *); +extern int dladm_walk_macaddr(datalink_id_t, void *, + boolean_t (*)(void *, dladm_macaddr_attr_t *)); +extern int dladm_walk_hwgrp(datalink_id_t, void *, + boolean_t (*)(void *, dladm_hwgrp_attr_t *)); + +extern dladm_status_t dladm_link_get_proplist(datalink_id_t, + dladm_arg_list_t **); + +extern dladm_status_t i_dladm_set_link_proplist_db(char *, + dladm_arg_list_t *); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libdladm/common/libdlstat.c b/usr/src/lib/libdladm/common/libdlstat.c new file mode 100644 index 0000000000..1990d27c67 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdlstat.c @@ -0,0 +1,684 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <err.h> +#include <errno.h> +#include <kstat.h> +#include <unistd.h> +#include <signal.h> +#include <sys/dld.h> + +#include <libdllink.h> +#include <libdlflow.h> +#include <libdlstat.h> + +/* + * x86 <sys/regs> ERR conflicts with <curses.h> ERR. 
+ * Include curses.h last.
+ */
+#if defined(ERR)
+#undef ERR
+#endif
+#include <curses.h>
+
+struct flowlist {
+	char		flowname[MAXNAMELEN];
+	datalink_id_t	linkid;
+	uint64_t	ifspeed;
+	boolean_t	first;
+	boolean_t	display;
+	pktsum_t	prevstats;
+	pktsum_t	diffstats;
+};
+
+static	int	maxx, maxy, redraw = 0;
+static	volatile uint_t handle_resize = 0, handle_break = 0;
+
+pktsum_t	totalstats;
+struct flowlist	*stattable = NULL;
+static int	statentry = -1, maxstatentries = 0;
+
+#define	STATGROWSIZE	16
+
+
+/*
+ * Search for flowlist entry in stattable which matches
+ * the flowname and linkid. If no match is found, use
+ * next available slot. If no slots are available,
+ * reallocate table with more slots.
+ *
+ * Return: *flowlist of matching flow
+ *         NULL if realloc fails
+ */
+
+static struct flowlist *
+findstat(const char *flowname, datalink_id_t linkid)
+{
+	int match = 0;
+	struct flowlist *flist;
+
+	/* Look for match in the stattable */
+	for (match = 0, flist = stattable;
+	    match <= statentry;
+	    match++, flist++) {
+
+		if (flist == NULL)
+			break;
+		/* match the flowname */
+		if (flowname != NULL) {
+			if (strncmp(flowname, flist->flowname, MAXNAMELEN)
+			    == 0)
+				return (flist);
+		/* match the linkid */
+		} else {
+			if (linkid == flist->linkid)
+				return (flist);
+		}
+	}
+
+	/*
+	 * No match found in the table. Store statistics in the next slot.
+	 * If necessary, make room for this entry.
+	 */
+	statentry++;
+	if ((maxstatentries == 0) || (maxstatentries == statentry)) {
+		maxstatentries += STATGROWSIZE;
+		stattable = realloc(stattable,
+		    maxstatentries * sizeof (struct flowlist));
+		if (stattable == NULL) {
+			perror("realloc");
+			return (struct flowlist *)(NULL);
+		}
+	}
+	flist = &stattable[statentry];
+	bzero(flist, sizeof (struct flowlist));
+	flist->first = B_TRUE;
+
+	if (flowname != NULL)
+		(void) strncpy(flist->flowname, flowname, MAXNAMELEN);
+	flist->linkid = linkid;
+	return (flist);
+}
+
+static void
+print_flow_stats(struct flowlist *flist)
+{
+	struct flowlist *fcurr;
+	double ikbs, okbs;
+	double ipks, opks;
+	double dlt;
+	int fcount;
+	static boolean_t first = B_TRUE;
+
+	if (first) {
+		first = B_FALSE;
+		(void) printw("please wait...\n");
+		return;
+	}
+
+	for (fcount = 0, fcurr = flist;
+	    fcount <= statentry;
+	    fcount++, fcurr++) {
+		if (fcurr->flowname[0] != '\0' && fcurr->display) {
+			char linkname[MAXNAMELEN];
+
+			(void) dladm_datalink_id2info(fcurr->linkid, NULL, NULL,
+			    NULL, linkname, sizeof (linkname));
+			dlt = (double)fcurr->diffstats.snaptime/(double)NANOSEC;
+			ikbs = fcurr->diffstats.rbytes * 8 / dlt / 1024;
+			okbs = fcurr->diffstats.obytes * 8 / dlt / 1024;
+			ipks = fcurr->diffstats.ipackets / dlt;
+			opks = fcurr->diffstats.opackets / dlt;
+			(void) printw("%-15.15s", fcurr->flowname);
+			(void) printw("%-10.10s", linkname);
+			(void) printw("%9.2f %9.2f %9.2f %9.2f ",
+			    ikbs, okbs, ipks, opks);
+			(void) printw("\n");
+		}
+	}
+}
+
+/*ARGSUSED*/
+static int
+flow_kstats(dladm_flow_attr_t *attr, void *arg)
+{
+	kstat_ctl_t	*kcp = (kstat_ctl_t *)arg;
+	kstat_t		*ksp;
+	struct flowlist	*flist;
+	pktsum_t	currstats, *prevstats, *diffstats;
+
+	flist = findstat(attr->fa_flowname, attr->fa_linkid);
+	if (flist != NULL) {
+		prevstats = &flist->prevstats;
+		diffstats = &flist->diffstats;
+	} else {
+		return (DLADM_STATUS_FAILED);
+	}
+
+	/* lookup kstat entry */
+	ksp = dladm_kstat_lookup(kcp, NULL, -1, attr->fa_flowname, "flow");
+
+	if (ksp == NULL)
+		return (DLADM_WALK_TERMINATE);
+	else
+		flist->display = B_TRUE;
+
+	dladm_get_stats(kcp, ksp, &currstats);
+	if
(flist->ifspeed == 0) + (void) dladm_kstat_value(ksp, "ifspeed", KSTAT_DATA_UINT64, + &flist->ifspeed); + + if (flist->first) + flist->first = B_FALSE; + else { + dladm_stats_diff(diffstats, &currstats, prevstats); + dladm_stats_total(&totalstats, diffstats, &totalstats); + } + + bcopy(&currstats, prevstats, sizeof (pktsum_t)); + return (DLADM_WALK_CONTINUE); +} + +static void +print_link_stats(struct flowlist *flist) +{ + struct flowlist *fcurr; + double ikbs, okbs; + double ipks, opks; + double util; + double dlt; + int fcount; + static boolean_t first = B_TRUE; + + if (first) { + first = B_FALSE; + (void) printw("please wait...\n"); + return; + } + + for (fcount = 0, fcurr = flist; + fcount <= statentry; + fcount++, fcurr++) { + if ((fcurr->linkid != DATALINK_INVALID_LINKID) && + fcurr->display) { + char linkname[MAXNAMELEN]; + + (void) dladm_datalink_id2info(fcurr->linkid, NULL, NULL, + NULL, linkname, sizeof (linkname)); + dlt = (double)fcurr->diffstats.snaptime/(double)NANOSEC; + ikbs = (double)fcurr->diffstats.rbytes * 8 / dlt / 1024; + okbs = (double)fcurr->diffstats.obytes * 8 / dlt / 1024; + ipks = (double)fcurr->diffstats.ipackets / dlt; + opks = (double)fcurr->diffstats.opackets / dlt; + (void) printw("%-10.10s", linkname); + (void) printw("%9.2f %9.2f %9.2f %9.2f ", + ikbs, okbs, ipks, opks); + if (fcurr->ifspeed != 0) + util = ((ikbs + okbs) * 1024) * + 100/ fcurr->ifspeed; + else + util = (double)0; + (void) attron(A_BOLD); + (void) printw(" %6.2f", util); + (void) attroff(A_BOLD); + (void) printw("\n"); + } + } +} + +/* + * This function is called through the dladm_walk_datalink_id() walker and + * calls the dladm_walk_flow() walker. + */ + +/*ARGSUSED*/ +static int +link_flowstats(datalink_id_t linkid, void *arg) +{ + return (dladm_walk_flow(flow_kstats, linkid, arg, B_FALSE)); +} + +/*ARGSUSED*/ +static int +link_kstats(datalink_id_t linkid, void *arg) +{ + kstat_ctl_t *kcp = (kstat_ctl_t *)arg; + struct flowlist *flist; + pktsum_t currstats, *prevstats, *diffstats; + kstat_t *ksp; + char linkname[MAXNAMELEN]; + + /* find the flist entry */ + flist = findstat(NULL, linkid); + if (flist != NULL) { + prevstats = &flist->prevstats; + diffstats = &flist->diffstats; + } else { + return (DLADM_WALK_CONTINUE); + } + + /* lookup kstat entry */ + (void) dladm_datalink_id2info(linkid, NULL, NULL, NULL, linkname, + sizeof (linkname)); + + if (linkname == NULL) { + warn("no linkname for linkid"); + return (DLADM_WALK_TERMINATE); + } + + ksp = dladm_kstat_lookup(kcp, NULL, -1, linkname, "net"); + + if (ksp == NULL) + return (DLADM_WALK_TERMINATE); + else + flist->display = B_TRUE; + + /* read packet and byte stats */ + dladm_get_stats(kcp, ksp, &currstats); + + if (flist->ifspeed == 0) + (void) dladm_kstat_value(ksp, "ifspeed", KSTAT_DATA_UINT64, + &flist->ifspeed); + + if (flist->first == B_TRUE) + flist->first = B_FALSE; + else + dladm_stats_diff(diffstats, &currstats, prevstats); + + bcopy(&currstats, prevstats, sizeof (*prevstats)); + + return (DLADM_WALK_CONTINUE); +} + +/*ARGSUSED*/ +static void +sig_break(int s) +{ + handle_break = 1; +} + +/*ARGSUSED*/ +static void +sig_resize(int s) +{ + handle_resize = 1; +} + +static void +curses_init() +{ + maxx = maxx; /* lint */ + maxy = maxy; /* lint */ + + /* Install signal handlers */ + (void) signal(SIGINT, sig_break); + (void) signal(SIGQUIT, sig_break); + (void) signal(SIGTERM, sig_break); + (void) signal(SIGWINCH, sig_resize); + + /* Initialize ncurses */ + (void) initscr(); + (void) cbreak(); + (void) noecho(); + (void) 
curs_set(0);
+	timeout(0);
+	getmaxyx(stdscr, maxy, maxx);
+}
+
+static void
+curses_fin()
+{
+	(void) printw("\n");
+	(void) curs_set(1);
+	(void) nocbreak();
+	(void) endwin();
+
+	free(stattable);
+}
+
+static void
+stat_report(kstat_ctl_t *kcp, datalink_id_t linkid, const char *flowname,
+    int opt)
+{
+
+	double dlt, ikbs, okbs, ipks, opks;
+
+	struct flowlist *fstable = stattable;
+
+	if ((opt != LINK_REPORT) && (opt != FLOW_REPORT))
+		return;
+
+	/* Handle window resizes */
+	if (handle_resize) {
+		(void) endwin();
+		(void) initscr();
+		(void) cbreak();
+		(void) noecho();
+		(void) curs_set(0);
+		timeout(0);
+		getmaxyx(stdscr, maxy, maxx);
+		redraw = 1;
+		handle_resize = 0;
+	}
+
+	/* Print title */
+	(void) erase();
+	(void) attron(A_BOLD);
+	(void) move(0, 0);
+	if (opt == FLOW_REPORT)
+		(void) printw("%-15.15s", "Flow");
+	(void) printw("%-10.10s", "Link");
+	(void) printw("%9.9s %9.9s %9.9s %9.9s ",
+	    "iKb/s", "oKb/s", "iPk/s", "oPk/s");
+	if (opt == LINK_REPORT)
+		(void) printw(" %6.6s", "%Util");
+	(void) printw("\n");
+	(void) attroff(A_BOLD);
+
+	(void) move(2, 0);
+
+	/* Print stats for each link or flow */
+	bzero(&totalstats, sizeof (totalstats));
+	if (opt == LINK_REPORT) {
+		/* Display all links */
+		if (linkid == DATALINK_ALL_LINKID) {
+			(void) dladm_walk_datalink_id(link_kstats,
+			    (void *)kcp, DATALINK_CLASS_ALL,
+			    DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+		/* Display 1 link */
+		} else {
+			(void) link_kstats(linkid, kcp);
+		}
+		print_link_stats(fstable);
+
+	} else if (opt == FLOW_REPORT) {
+		/* Display 1 flow */
+		if (flowname != NULL) {
+			dladm_flow_attr_t fattr;
+			if (dladm_flow_info(flowname, &fattr) !=
+			    DLADM_STATUS_OK)
+				return;
+			(void) flow_kstats(&fattr, kcp);
+		/* Display all flows on all links */
+		} else if (linkid == DATALINK_ALL_LINKID) {
+			(void) dladm_walk_datalink_id(link_flowstats,
+			    (void *)kcp, DATALINK_CLASS_ALL,
+			    DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+		/* Display all flows on a link */
+		} else if (linkid != DATALINK_INVALID_LINKID) {
+			(void) dladm_walk_flow(flow_kstats, linkid, kcp,
+			    B_FALSE);
+		}
+		print_flow_stats(fstable);
+
+		/* Print totals */
+		(void) attron(A_BOLD);
+		dlt = (double)totalstats.snaptime / (double)NANOSEC;
+		ikbs = totalstats.rbytes * 8 / dlt / 1024;
+		okbs = totalstats.obytes * 8 / dlt / 1024;
+		ipks = totalstats.ipackets / dlt;
+		opks = totalstats.opackets / dlt;
+		(void) printw("\n%-25.25s", "Totals");
+		(void) printw("%9.2f %9.2f %9.2f %9.2f ",
+		    ikbs, okbs, ipks, opks);
+		(void) attroff(A_BOLD);
+	}
+
+	if (redraw)
+		(void) clearok(stdscr, 1);
+
+	if (refresh() == ERR)
+		return;
+
+	if (redraw) {
+		(void) clearok(stdscr, 0);
+		redraw = 0;
+	}
+}
+
+/* Exported functions */
+
+/*
+ * Continuously display link or flow statistics using a libcurses
+ * based display.
+ */
+
+void
+dladm_continuous(datalink_id_t linkid, const char *flowname, int interval,
+    int opt)
+{
+	kstat_ctl_t *kcp;
+
+	if ((kcp = kstat_open()) == NULL) {
+		warn("kstat open operation failed");
+		return;
+	}
+
+	curses_init();
+
+	for (;;) {
+
+		if (handle_break)
+			break;
+
+		stat_report(kcp, linkid, flowname, opt);
+
+		(void) sleep(max(1, interval));
+	}
+
+	(void) curses_fin();
+	(void) kstat_close(kcp);
+}
+
+/*
+ * dladm_kstat_lookup() is a modified version of kstat_lookup which
+ * adds the class as a selector.
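+ * Each of module, instance, name and class may be wildcarded with
+ * NULL (or -1 for instance), so a typical per-link query is
+ * (sketch, with an illustrative link name):
+ *
+ *	ksp = dladm_kstat_lookup(kcp, NULL, -1, "bge0", "net");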
+ */
+
+kstat_t *
+dladm_kstat_lookup(kstat_ctl_t *kcp, const char *module, int instance,
+    const char *name, const char *class)
+{
+	kstat_t	*ksp = NULL;
+
+	for (ksp = kcp->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		if ((module == NULL || strcmp(ksp->ks_module, module) == 0) &&
+		    (instance == -1 || ksp->ks_instance == instance) &&
+		    (name == NULL || strcmp(ksp->ks_name, name) == 0) &&
+		    (class == NULL || strcmp(ksp->ks_class, class) == 0))
+			return (ksp);
+	}
+
+	errno = ENOENT;
+	return (NULL);
+}
+
+/*
+ * dladm_get_stats() populates the supplied pktsum_t structure with
+ * the input and output packet and byte kstats from the kstat_t
+ * found with dladm_kstat_lookup.
+ */
+void
+dladm_get_stats(kstat_ctl_t *kcp, kstat_t *ksp, pktsum_t *stats)
+{
+
+	if (kstat_read(kcp, ksp, NULL) == -1)
+		return;
+
+	stats->snaptime = gethrtime();
+
+	if (dladm_kstat_value(ksp, "ipackets64", KSTAT_DATA_UINT64,
+	    &stats->ipackets) < 0) {
+		if (dladm_kstat_value(ksp, "ipackets", KSTAT_DATA_UINT64,
+		    &stats->ipackets) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "opackets64", KSTAT_DATA_UINT64,
+	    &stats->opackets) < 0) {
+		if (dladm_kstat_value(ksp, "opackets", KSTAT_DATA_UINT64,
+		    &stats->opackets) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "rbytes64", KSTAT_DATA_UINT64,
+	    &stats->rbytes) < 0) {
+		if (dladm_kstat_value(ksp, "rbytes", KSTAT_DATA_UINT64,
+		    &stats->rbytes) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "obytes64", KSTAT_DATA_UINT64,
+	    &stats->obytes) < 0) {
+		if (dladm_kstat_value(ksp, "obytes", KSTAT_DATA_UINT64,
+		    &stats->obytes) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT32,
+	    &stats->ierrors) < 0) {
+		if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT64,
+		    &stats->ierrors) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT32,
+	    &stats->oerrors) < 0) {
+		if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT64,
+		    &stats->oerrors) < 0)
+			return;
+	}
+}
+
+int
+dladm_kstat_value(kstat_t *ksp, const char *name, uint8_t type, void *buf)
+{
+	kstat_named_t	*knp;
+
+	if ((knp = kstat_data_lookup(ksp, (char *)name)) == NULL)
+		return (-1);
+
+	if (knp->data_type != type)
+		return (-1);
+
+	switch (type) {
+	case KSTAT_DATA_UINT64:
+		*(uint64_t *)buf = knp->value.ui64;
+		break;
+	case KSTAT_DATA_UINT32:
+		*(uint32_t *)buf = knp->value.ui32;
+		break;
+	default:
+		return (-1);
+	}
+
+	return (0);
+}
+
+dladm_status_t
+dladm_get_single_mac_stat(datalink_id_t linkid, const char *name, uint8_t type,
+    void *val)
+{
+	kstat_ctl_t	*kcp;
+	char		module[DLPI_LINKNAME_MAX];
+	uint_t		instance;
+	char		link[DLPI_LINKNAME_MAX];
+	dladm_status_t	status;
+	uint32_t	flags, media;
+	kstat_t		*ksp;
+	dladm_phys_attr_t dpap;
+
+	if ((kcp = kstat_open()) == NULL) {
+		warn("kstat_open operation failed");
+		return (dladm_errno2status(errno));
+	}
+
+	if ((status = dladm_datalink_id2info(linkid, &flags, NULL, &media,
+	    link, DLPI_LINKNAME_MAX)) != DLADM_STATUS_OK) {
+		(void) kstat_close(kcp);
+		return (status);
+	}
+
+	if (media != DL_ETHER) {
+		(void) kstat_close(kcp);
+		return (DLADM_STATUS_LINKINVAL);
+	}
+
+	status = dladm_phys_info(linkid, &dpap, DLADM_OPT_PERSIST);
+
+	if (status != DLADM_STATUS_OK) {
+		(void) kstat_close(kcp);
+		return (status);
+	}
+
+	status = dladm_parselink(dpap.dp_dev, module, &instance);
+
+	if (status != DLADM_STATUS_OK) {
+		(void) kstat_close(kcp);
+		return (status);
+	}
+
+	/*
+	 * The kstat query could fail if the underlying MAC
+	 * driver was already detached.
+ */ + if ((ksp = kstat_lookup(kcp, module, instance, "mac")) == NULL && + (ksp = kstat_lookup(kcp, module, instance, NULL)) == NULL) + goto bail; + + if (kstat_read(kcp, ksp, NULL) == -1) + goto bail; + + if (dladm_kstat_value(ksp, name, type, val) < 0) + goto bail; + + (void) kstat_close(kcp); + return (DLADM_STATUS_OK); + +bail: + (void) kstat_close(kcp); + return (dladm_errno2status(errno)); +} + +/* Compute sum of 2 pktsums (s1 = s2 + s3) */ +void +dladm_stats_total(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) +{ + s1->rbytes = s2->rbytes + s3->rbytes; + s1->ipackets = s2->ipackets + s3->ipackets; + s1->ierrors = s2->ierrors + s3->ierrors; + s1->obytes = s2->obytes + s3->obytes; + s1->opackets = s2->opackets + s3->opackets; + s1->oerrors = s2->oerrors + s3->oerrors; + s1->snaptime = s2->snaptime; +} + +/* Compute differences between 2 pktsums (s1 = s2 - s3) */ +void +dladm_stats_diff(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) +{ + s1->rbytes = s2->rbytes - s3->rbytes; + s1->ipackets = s2->ipackets - s3->ipackets; + s1->ierrors = s2->ierrors - s3->ierrors; + s1->obytes = s2->obytes - s3->obytes; + s1->opackets = s2->opackets - s3->opackets; + s1->oerrors = s2->oerrors - s3->oerrors; + s1->snaptime = s2->snaptime - s3->snaptime; +} diff --git a/usr/src/lib/libdladm/common/libdlstat.h b/usr/src/lib/libdladm/common/libdlstat.h new file mode 100644 index 0000000000..a142275268 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdlstat.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBDLSTAT_H +#define _LIBDLSTAT_H + +/* + * This file includes structures, macros and common routines shared by all + * data-link administration, and routines which are used to retrieve and + * display statistics. 
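+ * The statistics helpers compose as follows in a typical polling
+ * loop (sketch): take a snapshot, diff it against the previous
+ * snapshot, then accumulate the delta into a running total:
+ *
+ *	dladm_get_stats(kcp, ksp, &curr);
+ *	dladm_stats_diff(&delta, &curr, &prev);		(delta = curr - prev)
+ *	dladm_stats_total(&total, &total, &delta);	(total += delta)
+ *	prev = curr;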
+ */ + +#include <kstat.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define LINK_REPORT 1 +#define FLOW_REPORT 2 + +typedef struct pktsum_s { + hrtime_t snaptime; + uint64_t ipackets; + uint64_t opackets; + uint64_t rbytes; + uint64_t obytes; + uint64_t ierrors; + uint64_t oerrors; +} pktsum_t; + +extern void dladm_continuous(datalink_id_t, const char *, int, int); + +extern kstat_t *dladm_kstat_lookup(kstat_ctl_t *, const char *, int, + const char *, const char *); +extern void dladm_get_stats(kstat_ctl_t *, kstat_t *, pktsum_t *); +extern int dladm_kstat_value(kstat_t *, const char *, uint8_t, + void *); +extern dladm_status_t dladm_get_single_mac_stat(datalink_id_t, const char *, + uint8_t, void *); + +extern void dladm_stats_total(pktsum_t *, pktsum_t *, pktsum_t *); +extern void dladm_stats_diff(pktsum_t *, pktsum_t *, pktsum_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBDLSTAT_H */ diff --git a/usr/src/lib/libdladm/common/libdlvlan.c b/usr/src/lib/libdladm/common/libdlvlan.c index f6d855db72..1dc04bf4eb 100644 --- a/usr/src/lib/libdladm/common/libdlvlan.c +++ b/usr/src/lib/libdladm/common/libdlvlan.c @@ -23,16 +23,8 @@ * Use is subject to license terms. */ -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <errno.h> -#include <assert.h> -#include <sys/dld.h> -#include <libdladm_impl.h> -#include <libdllink.h> #include <libdlvlan.h> +#include <libdlvnic.h> /* * VLAN Administration Library. @@ -44,106 +36,19 @@ /* * Returns the current attributes of the specified VLAN. */ -static dladm_status_t -i_dladm_vlan_info_active(datalink_id_t vlanid, dladm_vlan_attr_t *dvap) -{ - int fd; - dld_ioc_vlan_attr_t div; - dladm_status_t status = DLADM_STATUS_OK; - - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); - - div.div_vlanid = vlanid; - - if (ioctl(fd, DLDIOC_VLAN_ATTR, &div) < 0) - status = dladm_errno2status(errno); - - dvap->dv_vid = div.div_vid; - dvap->dv_linkid = div.div_linkid; - dvap->dv_force = div.div_force; - dvap->dv_implicit = div.div_implicit; -done: - (void) close(fd); - return (status); -} - -/* - * Returns the persistent attributes of the specified VLAN. 
- */ -static dladm_status_t -i_dladm_vlan_info_persist(datalink_id_t vlanid, dladm_vlan_attr_t *dvap) -{ - dladm_conf_t conf = DLADM_INVALID_CONF; - dladm_status_t status; - uint64_t u64; - - if ((status = dladm_read_conf(vlanid, &conf)) != DLADM_STATUS_OK) - return (status); - - status = dladm_get_conf_field(conf, FLINKOVER, &u64, sizeof (u64)); - if (status != DLADM_STATUS_OK) - goto done; - dvap->dv_linkid = (datalink_id_t)u64; - - status = dladm_get_conf_field(conf, FFORCE, &dvap->dv_force, - sizeof (boolean_t)); - if (status != DLADM_STATUS_OK) - goto done; - - dvap->dv_implicit = B_FALSE; - - status = dladm_get_conf_field(conf, FVLANID, &u64, sizeof (u64)); - if (status != DLADM_STATUS_OK) - goto done; - dvap->dv_vid = (uint16_t)u64; - -done: - dladm_destroy_conf(conf); - return (status); -} - dladm_status_t dladm_vlan_info(datalink_id_t vlanid, dladm_vlan_attr_t *dvap, uint32_t flags) { - assert(flags == DLADM_OPT_ACTIVE || flags == DLADM_OPT_PERSIST); - if (flags == DLADM_OPT_ACTIVE) - return (i_dladm_vlan_info_active(vlanid, dvap)); - else - return (i_dladm_vlan_info_persist(vlanid, dvap)); -} - -static dladm_status_t -dladm_persist_vlan_conf(const char *vlan, datalink_id_t vlanid, - boolean_t force, datalink_id_t linkid, uint16_t vid) -{ - dladm_conf_t conf = DLADM_INVALID_CONF; - dladm_status_t status; - uint64_t u64; + dladm_status_t status; + dladm_vnic_attr_t attr, *vnic = &attr; - if ((status = dladm_create_conf(vlan, vlanid, DATALINK_CLASS_VLAN, - DL_ETHER, &conf)) != DLADM_STATUS_OK) { + if ((status = dladm_vnic_info(vlanid, vnic, flags)) != + DLADM_STATUS_OK) return (status); - } - u64 = linkid; - status = dladm_set_conf_field(conf, FLINKOVER, DLADM_TYPE_UINT64, &u64); - if (status != DLADM_STATUS_OK) - goto done; - - status = dladm_set_conf_field(conf, FFORCE, DLADM_TYPE_BOOLEAN, &force); - if (status != DLADM_STATUS_OK) - goto done; - - u64 = vid; - status = dladm_set_conf_field(conf, FVLANID, DLADM_TYPE_UINT64, &u64); - if (status != DLADM_STATUS_OK) - goto done; - - status = dladm_write_conf(conf); - -done: - dladm_destroy_conf(conf); + dvap->dv_vid = vnic->va_vid; + dvap->dv_linkid = vnic->va_link_id; + dvap->dv_force = vnic->va_force; return (status); } @@ -152,63 +57,11 @@ done: */ dladm_status_t dladm_vlan_create(const char *vlan, datalink_id_t linkid, uint16_t vid, - uint32_t flags) + dladm_arg_list_t *proplist, uint32_t flags, datalink_id_t *vlan_id_out) { - dld_ioc_create_vlan_t dic; - int fd; - datalink_id_t vlanid = DATALINK_INVALID_LINKID; - uint_t media; - datalink_class_t class; - dladm_status_t status; - - if (vid < 1 || vid > 4094) - return (DLADM_STATUS_VIDINVAL); - - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); - - status = dladm_datalink_id2info(linkid, NULL, &class, &media, NULL, 0); - if (status != DLADM_STATUS_OK || media != DL_ETHER || - class == DATALINK_CLASS_VLAN) { - return (DLADM_STATUS_BADARG); - } - - status = dladm_create_datalink_id(vlan, DATALINK_CLASS_VLAN, DL_ETHER, - flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST), &vlanid); - if (status != DLADM_STATUS_OK) - goto fail; - - if (flags & DLADM_OPT_PERSIST) { - status = dladm_persist_vlan_conf(vlan, vlanid, - (flags & DLADM_OPT_FORCE) != 0, linkid, vid); - if (status != DLADM_STATUS_OK) - goto fail; - } - - if (flags & DLADM_OPT_ACTIVE) { - dic.dic_vlanid = vlanid; - dic.dic_linkid = linkid; - dic.dic_vid = vid; - dic.dic_force = (flags & DLADM_OPT_FORCE) != 0; - - if (ioctl(fd, DLDIOC_CREATE_VLAN, &dic) < 0) { - status = dladm_errno2status(errno); - 
if (flags & DLADM_OPT_PERSIST) - (void) dladm_remove_conf(vlanid); - goto fail; - } - } - - (void) close(fd); - return (DLADM_STATUS_OK); - -fail: - if (vlanid != DATALINK_INVALID_LINKID) { - (void) dladm_destroy_datalink_id(vlanid, - flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST)); - } - (void) close(fd); - return (status); + return (dladm_vnic_create(vlan, linkid, VNIC_MAC_ADDR_TYPE_PRIMARY, + NULL, 0, NULL, 0, vid, vlan_id_out, proplist, + flags | DLADM_OPT_VLAN)); } /* @@ -217,124 +70,11 @@ fail: dladm_status_t dladm_vlan_delete(datalink_id_t vlanid, uint32_t flags) { - dld_ioc_delete_vlan_t did; - int fd; - datalink_class_t class; - dladm_status_t status = DLADM_STATUS_OK; - - if ((dladm_datalink_id2info(vlanid, NULL, &class, NULL, NULL, 0) != - DLADM_STATUS_OK) || (class != DATALINK_CLASS_VLAN)) { - return (DLADM_STATUS_BADARG); - } - - if (flags & DLADM_OPT_ACTIVE) { - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); - - did.did_linkid = vlanid; - if ((ioctl(fd, DLDIOC_DELETE_VLAN, &did) < 0) && - ((errno != ENOENT) || !(flags & DLADM_OPT_PERSIST))) { - (void) close(fd); - return (dladm_errno2status(errno)); - } - (void) close(fd); - - /* - * Delete active linkprop before this active link is deleted. - */ - (void) dladm_set_linkprop(vlanid, NULL, NULL, 0, - DLADM_OPT_ACTIVE); - } - - (void) dladm_destroy_datalink_id(vlanid, - flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST)); - - if (flags & DLADM_OPT_PERSIST) - (void) dladm_remove_conf(vlanid); - - return (status); -} - -/* - * Callback used by dladm_vlan_up() - */ -static int -i_dladm_vlan_up(datalink_id_t vlanid, void *arg) -{ - dladm_vlan_attr_t dva; - dld_ioc_create_vlan_t dic; - dladm_status_t *statusp = arg; - uint32_t flags; - int fd; - dladm_status_t status; - - status = dladm_vlan_info(vlanid, &dva, DLADM_OPT_PERSIST); - if (status != DLADM_STATUS_OK) - goto done; - - /* - * Validate (and delete) the link associated with this VLAN, see if - * the specific hardware has been removed during system shutdown. - */ - if ((status = dladm_datalink_id2info(dva.dv_linkid, &flags, NULL, - NULL, NULL, 0)) != DLADM_STATUS_OK) { - goto done; - } - - if (!(flags & DLADM_OPT_ACTIVE)) { - status = DLADM_STATUS_BADARG; - goto done; - } - - dic.dic_linkid = dva.dv_linkid; - dic.dic_force = dva.dv_force; - dic.dic_vid = dva.dv_vid; - - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) { - status = dladm_errno2status(errno); - goto done; - } - - dic.dic_vlanid = vlanid; - if (ioctl(fd, DLDIOC_CREATE_VLAN, &dic) < 0) { - status = dladm_errno2status(errno); - goto done; - } - - if ((status = dladm_up_datalink_id(vlanid)) != DLADM_STATUS_OK) { - dld_ioc_delete_vlan_t did; - - did.did_linkid = vlanid; - (void) ioctl(fd, DLDIOC_DELETE_VLAN, &did); - } else { - /* - * Reset the active linkprop of this specific link. - */ - (void) dladm_init_linkprop(vlanid, B_FALSE); - } - - (void) close(fd); -done: - *statusp = status; - return (DLADM_WALK_CONTINUE); + return (dladm_vnic_delete(vlanid, flags | DLADM_OPT_VLAN)); } -/* - * Bring up one VLAN, or all persistent VLANs. In the latter case, the - * walk may terminate early if bringup of a VLAN fails. 
- */ dladm_status_t dladm_vlan_up(datalink_id_t linkid) { - dladm_status_t status; - - if (linkid == DATALINK_ALL_LINKID) { - (void) dladm_walk_datalink_id(i_dladm_vlan_up, &status, - DATALINK_CLASS_VLAN, DATALINK_ANY_MEDIATYPE, - DLADM_OPT_PERSIST); - return (DLADM_STATUS_OK); - } else { - (void) i_dladm_vlan_up(linkid, &status); - return (status); - } + return (dladm_vnic_up(linkid, DLADM_OPT_VLAN)); } diff --git a/usr/src/lib/libdladm/common/libdlvlan.h b/usr/src/lib/libdladm/common/libdlvlan.h index 7a305443df..91f6ee8671 100644 --- a/usr/src/lib/libdladm/common/libdlvlan.h +++ b/usr/src/lib/libdladm/common/libdlvlan.h @@ -26,8 +26,6 @@ #ifndef _LIBDLVLAN_H #define _LIBDLVLAN_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file includes structures, macros and routines used by VLAN link * administration. @@ -43,13 +41,13 @@ typedef struct dladm_vlan_attr { uint16_t dv_vid; datalink_id_t dv_linkid; boolean_t dv_force; - boolean_t dv_implicit; } dladm_vlan_attr_t; extern dladm_status_t dladm_vlan_info(datalink_id_t, dladm_vlan_attr_t *, uint32_t); extern dladm_status_t dladm_vlan_create(const char *, datalink_id_t, - uint16_t, uint32_t); + uint16_t, dladm_arg_list_t *, uint32_t, + datalink_id_t *); extern dladm_status_t dladm_vlan_delete(datalink_id_t, uint32_t); extern dladm_status_t dladm_vlan_up(datalink_id_t); diff --git a/usr/src/lib/libdladm/common/libdlvnic.c b/usr/src/lib/libdladm/common/libdlvnic.c index ac97372785..dfa58bcac5 100644 --- a/usr/src/lib/libdladm/common/libdlvnic.c +++ b/usr/src/lib/libdladm/common/libdlvnic.c @@ -36,6 +36,7 @@ #include <libintl.h> #include <net/if_types.h> #include <net/if_dl.h> +#include <sys/dld.h> #include <libdladm_impl.h> #include <libdllink.h> #include <libdlvnic.h> @@ -44,137 +45,258 @@ * VNIC administration library. */ -/* Limits on buffer size for VNIC_IOC_INFO request */ -#define MIN_INFO_SIZE (4*1024) -#define MAX_INFO_SIZE (128*1024) - -/* configuration database entry */ -typedef struct dladm_vnic_attr_db { - datalink_id_t vt_vnic_id; - datalink_id_t vt_link_id; - vnic_mac_addr_type_t vt_mac_addr_type; - uint_t vt_mac_len; - uchar_t vt_mac_addr[MAXMACADDRLEN]; -} dladm_vnic_attr_db_t; - -typedef struct dladm_vnic_modify_attr { - vnic_mac_addr_type_t vm_mac_addr_type; - int vm_mac_len; - uchar_t vm_mac_addr[MAXMACADDRLEN]; -} dladm_vnic_modify_attr_t; +/* + * Default random MAC address prefix (locally administered). + */ +static char dladm_vnic_def_prefix[] = {0x02, 0x08, 0x20}; + +static dladm_status_t dladm_vnic_persist_conf(const char *name, + dladm_vnic_attr_t *, datalink_class_t); +static const char *dladm_vnic_macaddr2str(const uchar_t *, char *); +static dladm_status_t dladm_vnic_str2macaddr(const char *, uchar_t *); /* - * Send a create command to the VNIC driver. + * Convert a diagnostic returned by the kernel into a dladm_status_t. 
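+ * For example, VNIC_IOC_DIAG_MACADDR_INUSE maps to
+ * DLADM_STATUS_INVALIDMACADDRINUSE; a diagnostic with no mapping
+ * below falls through to DLADM_STATUS_OK.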
*/ static dladm_status_t -i_dladm_vnic_create_sys(int fd, dladm_vnic_attr_db_t *attr) +dladm_vnic_diag2status(vnic_ioc_diag_t ioc_diag) { - vnic_ioc_create_t ioc; - - ioc.vc_vnic_id = attr->vt_vnic_id; - ioc.vc_link_id = attr->vt_link_id; - ioc.vc_mac_addr_type = attr->vt_mac_addr_type; - ioc.vc_mac_len = attr->vt_mac_len; - bcopy(attr->vt_mac_addr, ioc.vc_mac_addr, attr->vt_mac_len); - - if (ioctl(fd, VNIC_IOC_CREATE, &ioc) < 0) - return (dladm_errno2status(errno)); - + switch (ioc_diag) { + case VNIC_IOC_DIAG_MACADDR_INVALID: + return (DLADM_STATUS_INVALIDMACADDR); + case VNIC_IOC_DIAG_MACADDRLEN_INVALID: + return (DLADM_STATUS_INVALIDMACADDRLEN); + case VNIC_IOC_DIAG_MACADDR_NIC: + return (DLADM_STATUS_INVALIDMACADDRNIC); + case VNIC_IOC_DIAG_MACADDR_INUSE: + return (DLADM_STATUS_INVALIDMACADDRINUSE); + case VNIC_IOC_DIAG_MACFACTORYSLOTINVALID: + return (DLADM_STATUS_MACFACTORYSLOTINVALID); + case VNIC_IOC_DIAG_MACFACTORYSLOTUSED: + return (DLADM_STATUS_MACFACTORYSLOTUSED); + case VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED: + return (DLADM_STATUS_MACFACTORYSLOTALLUSED); + case VNIC_IOC_DIAG_MACFACTORYNOTSUP: + return (DLADM_STATUS_MACFACTORYNOTSUP); + case VNIC_IOC_DIAG_MACPREFIX_INVALID: + return (DLADM_STATUS_INVALIDMACPREFIX); + case VNIC_IOC_DIAG_MACPREFIXLEN_INVALID: + return (DLADM_STATUS_INVALIDMACPREFIXLEN); + case VNIC_IOC_DIAG_MACMARGIN_INVALID: + return (DLADM_STATUS_INVALID_MACMARGIN); + case VNIC_IOC_DIAG_NO_HWRINGS: + return (DLADM_STATUS_NO_HWRINGS); + } return (DLADM_STATUS_OK); } /* - * Send a modify command to the VNIC driver. + * Send a create command to the VNIC driver. */ -static dladm_status_t -i_dladm_vnic_modify_sys(datalink_id_t vnic_id, uint32_t modify_mask, - dladm_vnic_modify_attr_t *attr) +dladm_status_t +i_dladm_vnic_create_sys(dladm_vnic_attr_t *attr) { + int rc, fd; + vnic_ioc_create_t ioc; dladm_status_t status = DLADM_STATUS_OK; - int fd; - vnic_ioc_modify_t ioc; - - ioc.vm_vnic_id = vnic_id; - ioc.vm_modify_mask = 0; - if (modify_mask & DLADM_VNIC_MODIFY_ADDR) - ioc.vm_modify_mask |= VNIC_IOC_MODIFY_ADDR; - - ioc.vm_mac_addr_type = attr->vm_mac_addr_type; - ioc.vm_mac_len = attr->vm_mac_len; - bcopy(attr->vm_mac_addr, ioc.vm_mac_addr, MAXMACADDRLEN); + bzero(&ioc, sizeof (ioc)); + ioc.vc_vnic_id = attr->va_vnic_id; + ioc.vc_link_id = attr->va_link_id; + ioc.vc_mac_addr_type = attr->va_mac_addr_type; + ioc.vc_mac_len = attr->va_mac_len; + ioc.vc_mac_slot = attr->va_mac_slot; + ioc.vc_mac_prefix_len = attr->va_mac_prefix_len; + ioc.vc_vid = attr->va_vid; + ioc.vc_flags = attr->va_force ? VNIC_IOC_CREATE_FORCE : 0; + ioc.vc_flags |= attr->va_hwrings ? 
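/*
 * Editor's sketch, not part of this patch: the error-handling pattern the
 * new VNIC ioctl wrappers share.  A failed ioctl yields a generic status
 * via errno, which the kernel-supplied diagnostic, when set, refines
 * through dladm_vnic_diag2status() above.  The helper name is
 * hypothetical; the calls are the ones used in this file.
 *
 *	static dladm_status_t
 *	vnic_ioc_status(int rc, vnic_ioc_diag_t diag)
 *	{
 *		if (rc >= 0)
 *			return (DLADM_STATUS_OK);
 *		if (diag != VNIC_IOC_DIAG_NONE)
 *			return (dladm_vnic_diag2status(diag));
 *		return (dladm_errno2status(errno));
 *	}
 */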
VNIC_IOC_CREATE_REQ_HWRINGS : 0; + + if (attr->va_mac_len > 0 || ioc.vc_mac_prefix_len > 0) + bcopy(attr->va_mac_addr, ioc.vc_mac_addr, MAXMACADDRLEN); + bcopy(&attr->va_resource_props, &ioc.vc_resource_props, + sizeof (mac_resource_props_t)); + if (attr->va_link_id == DATALINK_INVALID_LINKID) + ioc.vc_flags |= VNIC_IOC_CREATE_ANCHOR; if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) return (dladm_errno2status(errno)); - if (ioctl(fd, VNIC_IOC_MODIFY, &ioc) < 0) + rc = ioctl(fd, VNIC_IOC_CREATE, &ioc); + if (rc < 0) status = dladm_errno2status(errno); (void) close(fd); + if (status != DLADM_STATUS_OK) { + if (ioc.vc_diag != VNIC_IOC_DIAG_NONE) + status = dladm_vnic_diag2status(ioc.vc_diag); + } + if (status != DLADM_STATUS_OK) + return (status); + + attr->va_mac_addr_type = ioc.vc_mac_addr_type; + switch (ioc.vc_mac_addr_type) { + case VNIC_MAC_ADDR_TYPE_FACTORY: + attr->va_mac_slot = ioc.vc_mac_slot; + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: + bcopy(ioc.vc_mac_addr, attr->va_mac_addr, MAXMACADDRLEN); + attr->va_mac_len = ioc.vc_mac_len; + break; + } return (status); } /* * Get the configuration information of the given VNIC. */ -dladm_status_t -dladm_vnic_info(datalink_id_t vnic_id, dladm_vnic_attr_sys_t *attrp, - uint32_t flags) +static dladm_status_t +i_dladm_vnic_info_active(datalink_id_t linkid, dladm_vnic_attr_t *attrp) { - vnic_ioc_info_t *ioc; - vnic_ioc_info_vnic_t *vnic; - int bufsize, fd; + vnic_ioc_info_t ioc; + vnic_info_t *vnic; + int rc, fd; dladm_status_t status = DLADM_STATUS_OK; - /* for now, only temporary creations are supported */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) == -1) return (dladm_errno2status(errno)); - bufsize = sizeof (vnic_ioc_info_t) + sizeof (vnic_ioc_info_vnic_t); - ioc = (vnic_ioc_info_t *)calloc(1, bufsize); - if (ioc == NULL) { - (void) close(fd); - return (dladm_errno2status(ENOMEM)); - } + bzero(&ioc, sizeof (ioc)); + vnic = &ioc.vi_info; + vnic->vn_vnic_id = linkid; - ioc->vi_vnic_id = vnic_id; - ioc->vi_size = bufsize - sizeof (vnic_ioc_info_t); - if (ioctl(fd, VNIC_IOC_INFO, ioc) != 0) { + rc = ioctl(fd, VNIC_IOC_INFO, &ioc); + if (rc != 0) { status = dladm_errno2status(errno); goto bail; } - vnic = (vnic_ioc_info_vnic_t *)(ioc + 1); - attrp->va_vnic_id = vnic->vn_vnic_id; attrp->va_link_id = vnic->vn_link_id; attrp->va_mac_addr_type = vnic->vn_mac_addr_type; - bcopy(vnic->vn_mac_addr, attrp->va_mac_addr, ETHERADDRL); + bcopy(vnic->vn_mac_addr, attrp->va_mac_addr, MAXMACADDRLEN); attrp->va_mac_len = vnic->vn_mac_len; + attrp->va_mac_slot = vnic->vn_mac_slot; + attrp->va_mac_prefix_len = vnic->vn_mac_prefix_len; + attrp->va_vid = vnic->vn_vid; + attrp->va_force = vnic->vn_force; bail: - free(ioc); (void) close(fd); return (status); } +static dladm_status_t +i_dladm_vnic_info_persist(datalink_id_t linkid, dladm_vnic_attr_t *attrp) +{ + dladm_conf_t conf; + dladm_status_t status; + char macstr[ETHERADDRL * 3]; + uint64_t u64; + datalink_class_t class; + + attrp->va_vnic_id = linkid; + if ((status = dladm_read_conf(linkid, &conf)) != DLADM_STATUS_OK) + return (status); + + status = dladm_get_conf_field(conf, FLINKOVER, &u64, sizeof (u64)); + attrp->va_link_id = ((status == DLADM_STATUS_OK) ? 
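/*
 * Editor's sketch, not part of this patch: reading a VNIC's running
 * attributes through dladm_vnic_info(), which dispatches to
 * i_dladm_vnic_info_active() above for DLADM_OPT_ACTIVE.  The link name
 * "vnic0" is illustrative; error handling is elided.
 *
 *	datalink_id_t linkid;
 *	dladm_vnic_attr_t attr;
 *
 *	if (dladm_name2info("vnic0", &linkid, NULL, NULL, NULL) ==
 *	    DLADM_STATUS_OK &&
 *	    dladm_vnic_info(linkid, &attr, DLADM_OPT_ACTIVE) ==
 *	    DLADM_STATUS_OK)
 *		(void) printf("over link %u, vid %u\n",
 *		    attr.va_link_id, attr.va_vid);
 */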
+ (datalink_id_t)u64 : DATALINK_INVALID_LINKID); + + status = dladm_get_conf_field(conf, FHWRINGS, &attrp->va_hwrings, + sizeof (boolean_t)); + + if (status != DLADM_STATUS_OK && status != DLADM_STATUS_NOTFOUND) + goto done; + if (status == DLADM_STATUS_NOTFOUND) + attrp->va_hwrings = B_FALSE; + + if ((status = dladm_datalink_id2info(linkid, NULL, &class, + NULL, NULL, 0)) != DLADM_STATUS_OK) + goto done; + + if (class == DATALINK_CLASS_VLAN) { + if (attrp->va_link_id == DATALINK_INVALID_LINKID) { + status = DLADM_STATUS_BADARG; + goto done; + } + attrp->va_mac_addr_type = VNIC_MAC_ADDR_TYPE_PRIMARY; + attrp->va_mac_len = 0; + } else { + status = dladm_get_conf_field(conf, FMADDRTYPE, &u64, + sizeof (u64)); + if (status != DLADM_STATUS_OK) + goto done; + + attrp->va_mac_addr_type = (vnic_mac_addr_type_t)u64; + + status = dladm_get_conf_field(conf, FMADDRLEN, &u64, + sizeof (u64)); + attrp->va_mac_len = ((status == DLADM_STATUS_OK) ? + (uint_t)u64 : ETHERADDRL); + + status = dladm_get_conf_field(conf, FMADDRSLOT, &u64, + sizeof (u64)); + attrp->va_mac_slot = ((status == DLADM_STATUS_OK) ? + (int)u64 : -1); + + status = dladm_get_conf_field(conf, FMADDRPREFIXLEN, &u64, + sizeof (u64)); + attrp->va_mac_prefix_len = ((status == DLADM_STATUS_OK) ? + (uint_t)u64 : sizeof (dladm_vnic_def_prefix)); + + status = dladm_get_conf_field(conf, FMACADDR, macstr, + sizeof (macstr)); + if (status != DLADM_STATUS_OK) + goto done; + + status = dladm_vnic_str2macaddr(macstr, attrp->va_mac_addr); + if (status != DLADM_STATUS_OK) + goto done; + } + + status = dladm_get_conf_field(conf, FVLANID, &u64, sizeof (u64)); + attrp->va_vid = ((status == DLADM_STATUS_OK) ? (uint16_t)u64 : 0); + + + status = DLADM_STATUS_OK; +done: + dladm_destroy_conf(conf); + return (status); +} + +dladm_status_t +dladm_vnic_info(datalink_id_t linkid, dladm_vnic_attr_t *attrp, + uint32_t flags) +{ + if (flags == DLADM_OPT_ACTIVE) + return (i_dladm_vnic_info_active(linkid, attrp)); + else if (flags == DLADM_OPT_PERSIST) + return (i_dladm_vnic_info_persist(linkid, attrp)); + else + return (DLADM_STATUS_BADARG); +} + /* * Remove a VNIC from the kernel. 
*/ -static dladm_status_t -i_dladm_vnic_delete_sys(int fd, dladm_vnic_attr_sys_t *attr) +dladm_status_t +i_dladm_vnic_delete_sys(datalink_id_t linkid) { vnic_ioc_delete_t ioc; + dladm_status_t status = DLADM_STATUS_OK; + int rc, fd; - ioc.vd_vnic_id = attr->va_vnic_id; - - if (ioctl(fd, VNIC_IOC_DELETE, &ioc) < 0) + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) return (dladm_errno2status(errno)); - return (DLADM_STATUS_OK); + ioc.vd_vnic_id = linkid; + + rc = ioctl(fd, VNIC_IOC_DELETE, &ioc); + if (rc < 0) + status = dladm_errno2status(errno); + + (void) close(fd); + return (status); } /* @@ -182,20 +304,32 @@ i_dladm_vnic_delete_sys(int fd, dladm_vnic_attr_sys_t *attr) */ typedef struct dladm_vnic_addr_type_s { - char *va_str; - vnic_mac_addr_type_t va_type; + const char *va_str; + vnic_mac_addr_type_t va_type; } dladm_vnic_addr_type_t; static dladm_vnic_addr_type_t addr_types[] = { {"fixed", VNIC_MAC_ADDR_TYPE_FIXED}, + {"random", VNIC_MAC_ADDR_TYPE_RANDOM}, + {"factory", VNIC_MAC_ADDR_TYPE_FACTORY}, + {"auto", VNIC_MAC_ADDR_TYPE_AUTO}, + {"fixed", VNIC_MAC_ADDR_TYPE_PRIMARY} }; #define NADDR_TYPES (sizeof (addr_types) / sizeof (dladm_vnic_addr_type_t)) -/* - * Return DLADM_STATUS_OK if a matching type was found, - * DLADM_STATUS_BADARG otherwise - */ +static const char * +dladm_vnic_macaddrtype2str(vnic_mac_addr_type_t type) +{ + int i; + + for (i = 0; i < NADDR_TYPES; i++) { + if (type == addr_types[i].va_type) + return (addr_types[i].va_str); + } + return (NULL); +} + dladm_status_t dladm_vnic_str2macaddrtype(const char *str, vnic_mac_addr_type_t *val) { @@ -209,136 +343,397 @@ dladm_vnic_str2macaddrtype(const char *str, vnic_mac_addr_type_t *val) return (DLADM_STATUS_OK); } } - return (DLADM_STATUS_BADARG); } + + /* - * Create a new VNIC. Update the configuration file and bring it up. + * Create a new VNIC / VLAN. Update the configuration file and bring it up. */ dladm_status_t dladm_vnic_create(const char *vnic, datalink_id_t linkid, vnic_mac_addr_type_t mac_addr_type, uchar_t *mac_addr, int mac_len, - datalink_id_t *vnic_id_out, uint32_t flags) + int *mac_slot, uint_t mac_prefix_len, uint16_t vid, + datalink_id_t *vnic_id_out, dladm_arg_list_t *proplist, uint32_t flags) { - dladm_vnic_attr_db_t attr; - int i, fd; + dladm_vnic_attr_t attr; datalink_id_t vnic_id; datalink_class_t class; - uint32_t media; - char *name = (char *)vnic; + uint32_t media = DL_ETHER; + char name[MAXLINKNAMELEN]; + uchar_t tmp_addr[MAXMACADDRLEN]; dladm_status_t status; + boolean_t is_vlan; + boolean_t is_etherstub; + int i; /* * Sanity test arguments. */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); + if ((flags & DLADM_OPT_ACTIVE) == 0) + return (DLADM_STATUS_NOTSUP); + + is_vlan = ((flags & DLADM_OPT_VLAN) != 0); + if (is_vlan && ((vid < 1 || vid > 4094))) + return (DLADM_STATUS_VIDINVAL); + + is_etherstub = (linkid == DATALINK_INVALID_LINKID); if (mac_len > MAXMACADDRLEN) return (DLADM_STATUS_INVALIDMACADDRLEN); - for (i = 0; i < NADDR_TYPES; i++) { - if (mac_addr_type == addr_types[i].va_type) - break; - } - if (i == NADDR_TYPES) + if (!dladm_vnic_macaddrtype2str(mac_addr_type)) return (DLADM_STATUS_INVALIDMACADDRTYPE); - if ((status = dladm_datalink_id2info(linkid, NULL, &class, &media, - NULL, 0)) != DLADM_STATUS_OK) { - return (status); + /* + * If a random address might be generated, but no prefix + * was specified by the caller, use the default MAC address + * prefix. 
+ */ + if ((mac_addr_type == VNIC_MAC_ADDR_TYPE_RANDOM || + mac_addr_type == VNIC_MAC_ADDR_TYPE_AUTO) && + mac_prefix_len == 0) { + mac_prefix_len = sizeof (dladm_vnic_def_prefix); + mac_addr = tmp_addr; + bcopy(dladm_vnic_def_prefix, mac_addr, mac_prefix_len); } - if (class == DATALINK_CLASS_VNIC) - return (DLADM_STATUS_BADARG); + if ((flags & DLADM_OPT_ANCHOR) == 0) { + if ((status = dladm_datalink_id2info(linkid, NULL, &class, + &media, NULL, 0)) != DLADM_STATUS_OK) + return (status); + + if (class == DATALINK_CLASS_VNIC || + class == DATALINK_CLASS_VLAN) + return (DLADM_STATUS_BADARG); + } else { + /* it's an anchor VNIC */ + if (linkid != DATALINK_INVALID_LINKID || vid != 0) + return (DLADM_STATUS_BADARG); + } if (vnic == NULL) { flags |= DLADM_OPT_PREFIX; - name = "vnic"; + (void) strlcpy(name, "vnic", sizeof (name)); + } else { + (void) strlcpy(name, vnic, sizeof (name)); } - if ((status = dladm_create_datalink_id(name, DATALINK_CLASS_VNIC, - media, flags, &vnic_id)) != DLADM_STATUS_OK) { + class = is_vlan ? DATALINK_CLASS_VLAN : + (is_etherstub ? DATALINK_CLASS_ETHERSTUB : DATALINK_CLASS_VNIC); + if ((status = dladm_create_datalink_id(name, class, + media, flags, &vnic_id)) != DLADM_STATUS_OK) return (status); + + if ((flags & DLADM_OPT_PREFIX) != 0) { + (void) snprintf(name + 4, sizeof (name), "%llu", vnic_id); + flags &= ~DLADM_OPT_PREFIX; } bzero(&attr, sizeof (attr)); - attr.vt_vnic_id = vnic_id; - attr.vt_link_id = linkid; - attr.vt_mac_addr_type = mac_addr_type; - attr.vt_mac_len = mac_len; - bcopy(mac_addr, attr.vt_mac_addr, mac_len); - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) { - status = dladm_errno2status(errno); + /* Extract resource_ctl and cpu_list from proplist */ + if (proplist != NULL) { + status = dladm_link_proplist_extract(proplist, + &attr.va_resource_props); + if (status != DLADM_STATUS_OK) + goto done; + } + + attr.va_vnic_id = vnic_id; + attr.va_link_id = linkid; + attr.va_mac_addr_type = mac_addr_type; + attr.va_mac_len = mac_len; + if (mac_slot != NULL) + attr.va_mac_slot = *mac_slot; + if (mac_len > 0) + bcopy(mac_addr, attr.va_mac_addr, mac_len); + else if (mac_prefix_len > 0) + bcopy(mac_addr, attr.va_mac_addr, mac_prefix_len); + attr.va_mac_prefix_len = mac_prefix_len; + attr.va_vid = vid; + attr.va_force = (flags & DLADM_OPT_FORCE) != 0; + attr.va_hwrings = (flags & DLADM_OPT_HWRINGS) != 0; + + status = i_dladm_vnic_create_sys(&attr); + if (status != DLADM_STATUS_OK) + goto done; + + /* Save vnic configuration and its properties */ + if (!(flags & DLADM_OPT_PERSIST)) + goto done; + + status = dladm_vnic_persist_conf(name, &attr, class); + if (status != DLADM_STATUS_OK) { + (void) i_dladm_vnic_delete_sys(vnic_id); goto done; } - status = i_dladm_vnic_create_sys(fd, &attr); - (void) close(fd); + if (proplist != NULL) { + for (i = 0; i < proplist->al_count; i++) { + dladm_arg_info_t *aip = &proplist->al_info[i]; + + status = dladm_set_linkprop(vnic_id, aip->ai_name, + aip->ai_val, aip->ai_count, DLADM_OPT_PERSIST); + if (status != DLADM_STATUS_OK) + break; + } + + if (status != DLADM_STATUS_OK) { + (void) dladm_remove_conf(vnic_id); + (void) i_dladm_vnic_delete_sys(vnic_id); + } + } done: if (status != DLADM_STATUS_OK) { - (void) dladm_destroy_datalink_id(vnic_id, - flags & ~DLADM_OPT_PREFIX); + (void) dladm_destroy_datalink_id(vnic_id, flags); } else { - *vnic_id_out = vnic_id; + if (vnic_id_out != NULL) + *vnic_id_out = vnic_id; + if (mac_slot != NULL) + *mac_slot = attr.va_mac_slot; } - return (status); } /* - * Modify the properties of a VNIC. 
+ * Delete a VNIC / VLAN. */ dladm_status_t -dladm_vnic_modify(datalink_id_t vnic_id, uint32_t modify_mask, - vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr, - uint32_t flags) +dladm_vnic_delete(datalink_id_t linkid, uint32_t flags) { - dladm_vnic_modify_attr_t new_attr; + dladm_status_t status; + datalink_class_t class; - /* for now, only temporary creations are supported */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); + if (flags == 0) + return (DLADM_STATUS_BADARG); - bzero(&new_attr, sizeof (new_attr)); + if ((dladm_datalink_id2info(linkid, NULL, &class, NULL, NULL, 0) != + DLADM_STATUS_OK)) + return (DLADM_STATUS_BADARG); - if (modify_mask & DLADM_VNIC_MODIFY_ADDR) { - new_attr.vm_mac_addr_type = mac_addr_type; - new_attr.vm_mac_len = mac_len; - bcopy(mac_addr, new_attr.vm_mac_addr, MAXMACADDRLEN); + if ((flags & DLADM_OPT_VLAN) != 0) { + if (class != DATALINK_CLASS_VLAN) + return (DLADM_STATUS_BADARG); + } else { + if (class != DATALINK_CLASS_VNIC && + class != DATALINK_CLASS_ETHERSTUB) + return (DLADM_STATUS_BADARG); } - /* update the properties of the existing VNIC */ - return (i_dladm_vnic_modify_sys(vnic_id, modify_mask, &new_attr)); + if ((flags & DLADM_OPT_ACTIVE) != 0) { + status = i_dladm_vnic_delete_sys(linkid); + if (status == DLADM_STATUS_OK) { + (void) dladm_set_linkprop(linkid, NULL, NULL, 0, + DLADM_OPT_ACTIVE); + (void) dladm_destroy_datalink_id(linkid, + DLADM_OPT_ACTIVE); + } else if (status != DLADM_STATUS_NOTFOUND || + !(flags & DLADM_OPT_PERSIST)) { + return (status); + } + } + if ((flags & DLADM_OPT_PERSIST) != 0) { + (void) dladm_destroy_datalink_id(linkid, DLADM_OPT_PERSIST); + (void) dladm_remove_conf(linkid); + } + return (DLADM_STATUS_OK); } -/* - * Delete a VNIC. - */ -dladm_status_t -dladm_vnic_delete(datalink_id_t vnic_id, uint32_t flags) +static const char * +dladm_vnic_macaddr2str(const uchar_t *mac, char *buf) { - dladm_status_t status; - dladm_vnic_attr_sys_t sys_attr; - int fd; + static char unknown_mac[] = {0, 0, 0, 0, 0, 0}; - /* for now, only temporary deletes are supported */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); + if (buf == NULL) + return (NULL); - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); + if (bcmp(unknown_mac, mac, ETHERADDRL) == 0) + (void) strlcpy(buf, "unknown", DLADM_STRSIZE); + else + return (_link_ntoa(mac, buf, ETHERADDRL, IFT_OTHER)); - sys_attr.va_vnic_id = vnic_id; - status = i_dladm_vnic_delete_sys(fd, &sys_attr); - (void) close(fd); + return (buf); +} - if (status != DLADM_STATUS_OK) +static dladm_status_t +dladm_vnic_str2macaddr(const char *str, uchar_t *buf) +{ + int len = 0; + uchar_t *b = _link_aton(str, &len); + + if (b == NULL || len >= MAXMACADDRLEN) + return (DLADM_STATUS_BADARG); + + bcopy(b, buf, len); + free(b); + return (DLADM_STATUS_OK); +} + + +static dladm_status_t +dladm_vnic_persist_conf(const char *name, dladm_vnic_attr_t *attrp, + datalink_class_t class) +{ + dladm_conf_t conf = DLADM_INVALID_CONF; + dladm_status_t status; + char macstr[ETHERADDRL * 3]; + uint64_t u64; + + if ((status = dladm_create_conf(name, attrp->va_vnic_id, + class, DL_ETHER, &conf)) != DLADM_STATUS_OK) return (status); - (void) dladm_destroy_datalink_id(vnic_id, flags); + if (attrp->va_link_id != DATALINK_INVALID_LINKID) { + u64 = attrp->va_link_id; + status = dladm_set_conf_field(conf, FLINKOVER, + DLADM_TYPE_UINT64, &u64); + if (status != DLADM_STATUS_OK) + goto done; + } + + if (class != DATALINK_CLASS_VLAN) { 
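/*
 * Editor's sketch, not part of this patch: a caller tearing a VNIC down
 * both in the kernel and in the persistent store, the flag combination
 * dladm_vnic_delete() above accepts.  The wrapper name is hypothetical.
 *
 *	static dladm_status_t
 *	remove_vnic(datalink_id_t linkid)
 *	{
 *		return (dladm_vnic_delete(linkid,
 *		    DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST));
 *	}
 */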
+		u64 = attrp->va_mac_addr_type;
+		status = dladm_set_conf_field(conf, FMADDRTYPE,
+		    DLADM_TYPE_UINT64, &u64);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+
+		if (attrp->va_mac_len != ETHERADDRL) {
+			u64 = attrp->va_mac_len;
+			status = dladm_set_conf_field(conf, FMADDRLEN,
+			    DLADM_TYPE_UINT64, &u64);
+			if (status != DLADM_STATUS_OK)
+				goto done;
+		}
+	}
+
+	if (attrp->va_hwrings) {
+		boolean_t hwrings = attrp->va_hwrings;
+		status = dladm_set_conf_field(conf, FHWRINGS,
+		    DLADM_TYPE_BOOLEAN, &hwrings);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+	if (class != DATALINK_CLASS_VLAN) {
+		if (attrp->va_mac_slot != -1) {
+			u64 = attrp->va_mac_slot;
+			status = dladm_set_conf_field(conf, FMADDRSLOT,
+			    DLADM_TYPE_UINT64, &u64);
+			if (status != DLADM_STATUS_OK)
+				goto done;
+		}
+
+		if (attrp->va_mac_prefix_len !=
+		    sizeof (dladm_vnic_def_prefix)) {
+			u64 = attrp->va_mac_prefix_len;
+			status = dladm_set_conf_field(conf, FMADDRPREFIXLEN,
+			    DLADM_TYPE_UINT64, &u64);
+			if (status != DLADM_STATUS_OK)
+				goto done;
+		}
+
+		(void) dladm_vnic_macaddr2str(attrp->va_mac_addr, macstr);
+		status = dladm_set_conf_field(conf, FMACADDR, DLADM_TYPE_STR,
+		    macstr);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+	if (attrp->va_vid != 0) {
+		u64 = attrp->va_vid;
+		status = dladm_set_conf_field(conf, FVLANID,
+		    DLADM_TYPE_UINT64, &u64);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+	/*
+	 * Commit the link configuration.
+	 */
+	status = dladm_write_conf(conf);
+
+done:
+	dladm_destroy_conf(conf);
+	return (status);
+}
+
+typedef struct dladm_vnic_up_arg_s {
+	uint32_t flags;
+	dladm_status_t status;
+} dladm_vnic_up_arg_t;
+
+#define	DLADM_VNIC_UP_FIRST_WALK	0x1
+#define	DLADM_VNIC_UP_SECOND_WALK	0x2
+
+static int
+i_dladm_vnic_up(datalink_id_t linkid, void *arg)
+{
+	dladm_status_t *statusp = &(((dladm_vnic_up_arg_t *)arg)->status);
+	dladm_vnic_attr_t attr;
+	dladm_status_t status;
+	dladm_arg_list_t *proplist;
+	uint32_t flags = ((dladm_vnic_up_arg_t *)arg)->flags;
+
+	bzero(&attr, sizeof (attr));
+
+	status = dladm_vnic_info(linkid, &attr, DLADM_OPT_PERSIST);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	/*
+	 * Create the VNICs that requested a hardware ring group in the
+	 * first walk, and the VNICs that did not in the second walk.
+	 */
+	if ((flags == DLADM_VNIC_UP_FIRST_WALK && !attr.va_hwrings) ||
+	    (flags == DLADM_VNIC_UP_SECOND_WALK && attr.va_hwrings))
+		goto done;
+
+	/* Get all properties for this VNIC */
+	status = dladm_link_get_proplist(linkid, &proplist);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	if (proplist != NULL) {
+		status = dladm_link_proplist_extract(proplist,
+		    &attr.va_resource_props);
+	}
+
+	status = i_dladm_vnic_create_sys(&attr);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	if ((status = dladm_up_datalink_id(linkid)) != DLADM_STATUS_OK) {
+		(void) i_dladm_vnic_delete_sys(linkid);
+		goto done;
+	}
+done:
+	*statusp = status;
+	return (DLADM_WALK_CONTINUE);
+}
+
+dladm_status_t
+dladm_vnic_up(datalink_id_t linkid, uint32_t flags)
+{
+	dladm_vnic_up_arg_t vnic_arg;
+	datalink_class_t class;
+
+	class = ((flags & DLADM_OPT_VLAN) != 0) ?
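/*
 * Editor's note, not part of this patch: dladm_vnic_up() walks the
 * persistent configuration twice -- first creating the VNICs that
 * reserved hardware rings, then the rest -- so that ring reservations
 * are honored before best-effort VNICs consume the remaining rings.
 * A boot-time caller brings everything up with:
 *
 *	(void) dladm_vnic_up(DATALINK_ALL_LINKID, 0);
 */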
DATALINK_CLASS_VLAN : + (DATALINK_CLASS_VNIC | DATALINK_CLASS_ETHERSTUB); + + if (linkid == DATALINK_ALL_LINKID) { + vnic_arg.flags = DLADM_VNIC_UP_FIRST_WALK; + (void) dladm_walk_datalink_id(i_dladm_vnic_up, &vnic_arg, + class, DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST); + vnic_arg.flags = DLADM_VNIC_UP_SECOND_WALK; + (void) dladm_walk_datalink_id(i_dladm_vnic_up, &vnic_arg, + class, DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST); + return (DLADM_STATUS_OK); + } else { + (void) i_dladm_vnic_up(linkid, &vnic_arg); + return (vnic_arg.status); + } +} diff --git a/usr/src/lib/libdladm/common/libdlvnic.h b/usr/src/lib/libdladm/common/libdlvnic.h index 79b4b01ba2..77f78130be 100644 --- a/usr/src/lib/libdladm/common/libdlvnic.h +++ b/usr/src/lib/libdladm/common/libdlvnic.h @@ -26,39 +26,43 @@ #ifndef _LIBDLVNIC_H #define _LIBDLVNIC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <netinet/in.h> #include <libdladm.h> +#include <libdladm_impl.h> +#include <sys/mac_flow.h> #include <sys/vnic.h> #ifdef __cplusplus extern "C" { #endif -typedef struct dladm_vnic_attr_sys { +typedef struct dladm_vnic_attr { datalink_id_t va_vnic_id; datalink_id_t va_link_id; vnic_mac_addr_type_t va_mac_addr_type; - uchar_t va_mac_addr[ETHERADDRL]; uint_t va_mac_len; -} dladm_vnic_attr_sys_t; + uchar_t va_mac_addr[MAXMACADDRLEN]; + int va_mac_slot; + uint_t va_mac_prefix_len; + uint16_t va_vid; + boolean_t va_force; + boolean_t va_hwrings; + mac_resource_props_t va_resource_props; +} dladm_vnic_attr_t; -/* - * Modification flags for dladm_vnic_modify(). - */ -#define DLADM_VNIC_MODIFY_ADDR 0x01 +extern dladm_status_t dladm_vnic_create(const char *, datalink_id_t, + vnic_mac_addr_type_t, uchar_t *, int, int *, + uint_t, uint16_t, datalink_id_t *, + dladm_arg_list_t *, uint32_t); + +extern dladm_status_t dladm_vnic_delete(datalink_id_t, uint32_t); +extern dladm_status_t dladm_vnic_info(datalink_id_t, dladm_vnic_attr_t *, + uint32_t); -extern dladm_status_t dladm_vnic_create(const char *, datalink_id_t, - vnic_mac_addr_type_t, uchar_t *, int, uint_t *, uint32_t); -extern dladm_status_t dladm_vnic_modify(datalink_id_t, uint32_t, - vnic_mac_addr_type_t, uint_t, uchar_t *, uint32_t); -extern dladm_status_t dladm_vnic_delete(datalink_id_t, uint32_t); -extern dladm_status_t dladm_vnic_info(datalink_id_t, dladm_vnic_attr_sys_t *, - uint32_t); -extern dladm_status_t dladm_vnic_str2macaddrtype(const char *, - vnic_mac_addr_type_t *); +extern dladm_status_t dladm_vnic_up(datalink_id_t, uint32_t); +extern dladm_status_t dladm_vnic_str2macaddrtype(const char *, + vnic_mac_addr_type_t *); #ifdef __cplusplus } diff --git a/usr/src/lib/libdladm/common/linkprop.c b/usr/src/lib/libdladm/common/linkprop.c index 8a570c70ef..2d58b585f8 100644 --- a/usr/src/lib/libdladm/common/linkprop.c +++ b/usr/src/lib/libdladm/common/linkprop.c @@ -41,30 +41,34 @@ #include <libdlwlan_impl.h> #include <libdlwlan.h> #include <libdlvlan.h> +#include <libdlvnic.h> +#include <libintl.h> #include <dlfcn.h> #include <link.h> #include <inet/wifi_ioctl.h> #include <libdladm.h> +#include <libdlstat.h> #include <sys/param.h> +#include <sys/debug.h> +#include <sys/dld.h> +#include <sys/mac_flow.h> #include <inttypes.h> #include <sys/ethernet.h> #include <net/wpa.h> #include <sys/sysmacros.h> -#define PERM_READ_ONLY "r-" -#define PERM_READ_WRITE "rw" - /* * The linkprop get() callback. - * - pd: pointer to the struct prop_desc + * - pd: pointer to the prop_desc_t * - propstrp: a property string array to keep the returned property. 
* Caller allocated. * - cntp: number of returned properties. * Caller also uses it to indicate how many it expects. */ struct prop_desc; +typedef struct prop_desc prop_desc_t; -typedef dladm_status_t pd_getf_t(struct prop_desc *pd, +typedef dladm_status_t pd_getf_t(prop_desc_t *pdp, datalink_id_t, char **propstp, uint_t *cntp, datalink_media_t, uint_t, uint_t *); @@ -79,10 +83,9 @@ typedef dladm_status_t pd_getf_t(struct prop_desc *pd, * of ioctl buffers etc. pd_set() may call another common routine (used * by all other pd_sets) which invokes the ioctl. */ -typedef dladm_status_t pd_setf_t(struct prop_desc *, datalink_id_t, - val_desc_t *propval, uint_t cnt, uint_t flags, - datalink_media_t); - +typedef dladm_status_t pd_setf_t(prop_desc_t *, datalink_id_t, + val_desc_t *propval, uint_t cnt, uint_t flags, + datalink_media_t); /* * The linkprop check() callback. @@ -98,9 +101,8 @@ typedef dladm_status_t pd_setf_t(struct prop_desc *, datalink_id_t, * with either a val_desc_t found on the pd_modval list or something * generated on the fly. */ -typedef dladm_status_t pd_checkf_t(struct prop_desc *pd, - datalink_id_t, char **propstrp, - uint_t cnt, val_desc_t *propval, +typedef dladm_status_t pd_checkf_t(prop_desc_t *pdp, datalink_id_t, + char **propstrp, uint_t cnt, val_desc_t *propval, datalink_media_t); typedef struct link_attr_s { @@ -110,39 +112,45 @@ typedef struct link_attr_s { } link_attr_t; static dld_ioc_macprop_t *i_dladm_buf_alloc_by_name(size_t, datalink_id_t, - const char *, uint_t, dladm_status_t *); + const char *, uint_t, dladm_status_t *); static dld_ioc_macprop_t *i_dladm_buf_alloc_by_id(size_t, datalink_id_t, - mac_prop_id_t, uint_t, - dladm_status_t *); + mac_prop_id_t, uint_t, dladm_status_t *); +static dld_ioc_macprop_t *i_dladm_get_public_prop(datalink_id_t, char *, uint_t, + dladm_status_t *, uint_t *); + static dladm_status_t i_dladm_set_prop(datalink_id_t, const char *, char **, uint_t, uint_t); static dladm_status_t i_dladm_get_prop(datalink_id_t, const char *, char **, uint_t *, dladm_prop_type_t, uint_t); static link_attr_t *dladm_name2prop(const char *); static link_attr_t *dladm_id2prop(mac_prop_id_t); -static dld_ioc_macprop_t *i_dladm_get_public_prop(datalink_id_t, char *, uint_t, - dladm_status_t *); + static pd_getf_t do_get_zone, do_get_autopush, do_get_rate_mod, do_get_rate_prop, do_get_channel_prop, do_get_powermode_prop, do_get_radio_prop, i_dladm_duplex_get, i_dladm_status_get, i_dladm_binary_get, i_dladm_uint32_get, - i_dladm_flowctl_get; + i_dladm_flowctl_get, dld_maxbw_get, dld_cpus_get, + dld_priority_get; + static pd_setf_t do_set_zone, do_set_rate_prop, do_set_powermode_prop, do_set_radio_prop, - i_dladm_set_public_prop; + i_dladm_set_public_prop, do_set_res, do_set_cpus; + static pd_checkf_t do_check_zone, do_check_autopush, do_check_rate, - i_dladm_defmtu_check; + i_dladm_defmtu_check, do_check_maxbw, do_check_cpus, + do_check_priority; -static dladm_status_t i_dladm_speed_get(struct prop_desc *, datalink_id_t, - char **, uint_t *, uint_t); +static dladm_status_t i_dladm_speed_get(prop_desc_t *, datalink_id_t, + char **, uint_t *, uint_t, uint_t *); static dladm_status_t i_dladm_wlan_get_legacy_ioctl(datalink_id_t, void *, uint_t, uint_t); static dladm_status_t i_dladm_wlan_set_legacy_ioctl(datalink_id_t, void *, uint_t, uint_t); static dladm_status_t i_dladm_macprop(void *, boolean_t); +static const char *dladm_perm2str(uint_t, char *); -typedef struct prop_desc { +struct prop_desc { /* * link property name */ @@ -202,7 +210,7 @@ typedef struct 
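/*
 * Editor's sketch, not part of this patch: the shape of a check callback.
 * A set request flows check -> set; check parses the user's strings into
 * a val_desc_t, allocating when the descriptor carries PD_CHECK_ALLOC so
 * the framework knows to free it.  The property and function name are
 * hypothetical.
 *
 *	static dladm_status_t
 *	do_check_example(prop_desc_t *pdp, datalink_id_t linkid,
 *	    char **prop_val, uint_t val_cnt, val_desc_t *vdp,
 *	    datalink_media_t media)
 *	{
 *		uint32_t *vp;
 *
 *		if (val_cnt != 1)
 *			return (DLADM_STATUS_BADVALCNT);
 *		if ((vp = malloc(sizeof (uint32_t))) == NULL)
 *			return (DLADM_STATUS_NOMEM);
 *		*vp = (uint32_t)strtoul(prop_val[0], NULL, 10);
 *		vdp->vd_val = (uintptr_t)vp;
 *		return (DLADM_STATUS_OK);
 *	}
 */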
prop_desc { * indicate link media type this property applies to. */ datalink_media_t pd_dmedia; -} prop_desc_t; +}; #define MAC_PROP_BUFSIZE(v) sizeof (dld_ioc_macprop_t) + (v) - 1 @@ -303,7 +311,14 @@ static link_attr_t link_attr[] = { { MAC_PROP_WL_MLME, sizeof (wl_mlme_t), "mlme"}, + { MAC_PROP_MAXBW, sizeof (mac_resource_props_t), "maxbw"}, + + { MAC_PROP_PRIO, sizeof (mac_resource_props_t), "priority"}, + + { MAC_PROP_BIND_CPU, sizeof (mac_resource_props_t), "cpus"}, + { MAC_PROP_PRIVATE, 0, "driver-private"} + }; static val_desc_t link_duplex_vals[] = { @@ -324,8 +339,11 @@ static val_desc_t link_flow_vals[] = { { "rx", LINK_FLOWCTRL_RX }, { "bi", LINK_FLOWCTRL_BI } }; - -#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) +static val_desc_t link_priority_vals[] = { + { "low", MPL_LOW }, + { "medium", MPL_MEDIUM }, + { "high", MPL_HIGH } +}; static val_desc_t dladm_wlan_radio_vals[] = { { "on", DLADM_WLAN_RADIO_ON }, @@ -338,8 +356,10 @@ static val_desc_t dladm_wlan_powermode_vals[] = { { "max", DLADM_WLAN_PM_MAX } }; -static prop_desc_t prop_table[] = { +#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) +#define RESET_VAL ((uintptr_t)-1) +static prop_desc_t prop_table[] = { { "channel", { NULL, 0 }, NULL, 0, NULL, NULL, do_get_channel_prop, NULL, 0, @@ -372,12 +392,12 @@ static prop_desc_t prop_table[] = { do_get_zone, do_check_zone, PD_TEMPONLY|PD_CHECK_ALLOC, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, - { "duplex", { "", 0 }, + { "duplex", { "", 0 }, link_duplex_vals, VALCNT(link_duplex_vals), NULL, NULL, i_dladm_duplex_get, NULL, 0, DATALINK_CLASS_PHYS, DL_ETHER }, - { "state", { "up", LINK_STATE_UP }, + { "state", { "up", LINK_STATE_UP }, link_status_vals, VALCNT(link_status_vals), NULL, NULL, i_dladm_status_get, NULL, 0, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, @@ -455,12 +475,34 @@ static prop_desc_t prop_table[] = { { "en_10hdx_cap", { "", 0 }, link_01_vals, VALCNT(link_01_vals), i_dladm_set_public_prop, NULL, i_dladm_binary_get, NULL, - 0, DATALINK_CLASS_PHYS, DL_ETHER } + 0, DATALINK_CLASS_PHYS, DL_ETHER }, + + { "maxbw", { "--", RESET_VAL }, NULL, 0, + do_set_res, NULL, + dld_maxbw_get, do_check_maxbw, PD_CHECK_ALLOC, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, + { "cpus", { "--", RESET_VAL }, NULL, 0, + do_set_cpus, NULL, + dld_cpus_get, do_check_cpus, 0, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, + + { "priority", { "high", RESET_VAL }, + link_priority_vals, VALCNT(link_priority_vals), do_set_res, NULL, + dld_priority_get, do_check_priority, PD_CHECK_ALLOC, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, }; #define DLADM_MAX_PROPS (sizeof (prop_table) / sizeof (prop_desc_t)) +static resource_prop_t rsrc_prop_table[] = { + {"maxbw", do_extract_maxbw}, + {"priority", do_extract_priority}, + {"cpus", do_extract_cpus} +}; +#define DLADM_MAX_RSRC_PROP (sizeof (rsrc_prop_table) / \ + sizeof (resource_prop_t)) + /* * when retrieving private properties, we pass down a buffer with * DLADM_PROP_BUF_CHUNK of space for the driver to return the property value. @@ -477,6 +519,9 @@ static dladm_status_t i_dladm_set_linkprop(datalink_id_t, const char *, char **, uint_t, uint_t); static dladm_status_t i_dladm_getset_defval(prop_desc_t *, datalink_id_t, datalink_media_t, uint_t); + +static dladm_status_t link_proplist_check(dladm_arg_list_t *); + /* * Unfortunately, MAX_SCAN_SUPPORT_RATES is too small to allow all * rates to be retrieved. 
However, we cannot increase it at this @@ -539,17 +584,13 @@ i_dladm_set_single_prop(datalink_id_t linkid, datalink_class_t class, if (pdp->pd_set == NULL) return (DLADM_STATUS_PROPRDONLY); - if (pdp->pd_flags & PD_CHECK_ALLOC) - needfree = B_TRUE; - else - needfree = B_FALSE; if (prop_val != NULL) { vdp = malloc(sizeof (val_desc_t) * val_cnt); if (vdp == NULL) return (DLADM_STATUS_NOMEM); - if (pdp->pd_check != NULL) { + needfree = ((pdp->pd_flags & PD_CHECK_ALLOC) != 0); status = pdp->pd_check(pdp, linkid, prop_val, val_cnt, vdp, media); } else if (pdp->pd_optval != NULL) { @@ -563,23 +604,25 @@ i_dladm_set_single_prop(datalink_id_t linkid, datalink_class_t class, cnt = val_cnt; } else { + boolean_t defval = B_FALSE; + if (pdp->pd_defval.vd_name == NULL) return (DLADM_STATUS_NOTSUP); cnt = 1; - if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0 || - strlen(pdp->pd_defval.vd_name) > 0) { + defval = (strlen(pdp->pd_defval.vd_name) > 0); + if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0 || defval) { if ((vdp = malloc(sizeof (val_desc_t))) == NULL) return (DLADM_STATUS_NOMEM); - if (pdp->pd_check != NULL) { + if (defval) { + (void) memcpy(vdp, &pdp->pd_defval, + sizeof (val_desc_t)); + } else if (pdp->pd_check != NULL) { status = pdp->pd_check(pdp, linkid, prop_val, cnt, vdp, media); if (status != DLADM_STATUS_OK) goto done; - } else { - (void) memcpy(vdp, &pdp->pd_defval, - sizeof (val_desc_t)); } } else { status = i_dladm_getset_defval(pdp, linkid, @@ -618,7 +661,6 @@ i_dladm_set_linkprop(datalink_id_t linkid, const char *prop_name, if (prop_name != NULL && (strcasecmp(prop_name, pdp->pd_name) != 0)) continue; - found = B_TRUE; s = i_dladm_set_single_prop(linkid, class, media, pdp, prop_val, val_cnt, flags); @@ -774,16 +816,8 @@ dladm_get_linkprop(datalink_id_t linkid, dladm_prop_type_t type, } *prop_val[0] = '\0'; - switch (perm_flags) { - case MAC_PROP_PERM_READ: - (void) strncpy(*prop_val, PERM_READ_ONLY, - DLADM_PROP_VAL_MAX); - break; - case MAC_PROP_PERM_RW: - (void) strncpy(*prop_val, PERM_READ_WRITE, - DLADM_PROP_VAL_MAX); - break; - } + if (status == DLADM_STATUS_OK) + (void) dladm_perm2str(perm_flags, *prop_val); break; case DLADM_PROP_VAL_DEFAULT: @@ -879,7 +913,16 @@ done: static int i_dladm_init_linkprop(datalink_id_t linkid, void *arg) { - (void) dladm_init_linkprop(linkid, B_TRUE); + datalink_class_t class; + dladm_status_t status; + + status = dladm_datalink_id2info(linkid, NULL, &class, NULL, NULL, 0); + if (status != DLADM_STATUS_OK) + return (DLADM_WALK_TERMINATE); + + if ((class & (DATALINK_CLASS_VNIC | DATALINK_CLASS_VLAN)) == 0) + (void) dladm_init_linkprop(linkid, B_TRUE); + return (DLADM_WALK_CONTINUE); } @@ -904,24 +947,24 @@ dladm_init_linkprop(datalink_id_t linkid, boolean_t any_media) /* ARGSUSED */ static dladm_status_t -do_get_zone(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_zone(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { - char zone_name[ZONENAME_MAX]; - zoneid_t zid; - dladm_status_t status; - char *cp; + char zone_name[ZONENAME_MAX]; + zoneid_t zid; + dladm_status_t status; + char *cp; dld_ioc_macprop_t *dip; if (flags != 0) return (DLADM_STATUS_NOTSUP); - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (status != DLADM_STATUS_OK) return (status); - *perm_flags = 
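/*
 * Editor's note, not part of this patch: passing a NULL value array to
 * dladm_set_linkprop() drives the default path above.  For the new
 * resource properties the default descriptor carries RESET_VAL, which
 * do_set_res() and do_set_cpus() translate into a reset of the kernel
 * state.  For example, to clear a bandwidth cap:
 *
 *	(void) dladm_set_linkprop(linkid, "maxbw", NULL, 0,
 *	    DLADM_OPT_ACTIVE);
 */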
dip->pr_perm_flags; cp = dip->pr_val; (void) memcpy(&zid, cp, sizeof (zid)); free(dip); @@ -929,14 +972,12 @@ do_get_zone(struct prop_desc *pd, datalink_id_t linkid, *val_cnt = 1; if (zid != GLOBAL_ZONEID) { if (getzonenamebyid(zid, zone_name, sizeof (zone_name)) < 0) { - *perm_flags = 0; return (dladm_errno2status(errno)); } (void) strncpy(*prop_val, zone_name, DLADM_PROP_VAL_MAX); } else { *prop_val[0] = '\0'; - *perm_flags = 0; } return (DLADM_STATUS_OK); @@ -1011,13 +1052,13 @@ cleanup: /* ARGSUSED */ static dladm_status_t -do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, +do_set_zone(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { - dladm_status_t status = DLADM_STATUS_OK; - zoneid_t zid_old, zid_new; - char link[MAXLINKNAMELEN]; - char *cp; + dladm_status_t status = DLADM_STATUS_OK; + zoneid_t zid_old, zid_new; + char link[MAXLINKNAMELEN]; + char *cp; dld_ioc_macprop_t *dip; dld_ioc_zid_t *dzp; @@ -1026,25 +1067,14 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, dzp = (dld_ioc_zid_t *)vdp->vd_val; - /* - * If diz_is_ppa_hack is set, then an implicit vlan must be created. - * There is no old value to compare against, and vdp->vd_val is - * already populated with the zoneid and linkname in the function - * do_check_zone(). - */ - - if (dzp->diz_is_ppa_hack) { - zid_old = GLOBAL_ZONEID; - } else { - dip = i_dladm_get_public_prop(linkid, pd->pd_name, - flags, &status); - if (status != DLADM_STATUS_OK) - return (status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, NULL); + if (status != DLADM_STATUS_OK) + return (status); - cp = dip->pr_val; - (void) memcpy(&zid_old, cp, sizeof (zid_old)); - free(dip); - } + cp = dip->pr_val; + (void) memcpy(&zid_old, cp, sizeof (zid_old)); + free(dip); zid_new = dzp->diz_zid; (void) strlcpy(link, dzp->diz_link, MAXLINKNAMELEN); @@ -1066,7 +1096,7 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, * link and prevent a link renaming, so we need to do it * before other operations. 
*/ - status = i_dladm_set_public_prop(pd, linkid, vdp, val_cnt, + status = i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt, flags, media); if (status != DLADM_STATUS_OK) return (status); @@ -1092,16 +1122,9 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, goto rollback2; } - if (dzp->diz_is_ppa_hack) { - if ((status = dladm_name2info(link, &linkid, NULL, NULL, - NULL)) != DLADM_STATUS_OK) { - return (status); - } - } - (void) i_dladm_update_deventry(zid_new, linkid, B_TRUE); } else { - status = i_dladm_set_public_prop(pd, linkid, vdp, val_cnt, + status = i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt, flags, media); if (status != DLADM_STATUS_OK) goto rollback2; @@ -1117,7 +1140,7 @@ rollback2: rollback1: if (zid_new != GLOBAL_ZONEID) { dzp->diz_zid = zid_old; - (void) i_dladm_set_public_prop(pd, linkid, vdp, val_cnt, + (void) i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt, flags, media); } @@ -1126,15 +1149,13 @@ rollback1: /* ARGSUSED */ static dladm_status_t -do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, +do_check_zone(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) { char *zone_name; char linkname[MAXLINKNAMELEN]; zoneid_t zoneid; - char *cp; dladm_status_t status = DLADM_STATUS_OK; - boolean_t is_ppa_hack = B_FALSE; dld_ioc_zid_t *dzp; if (val_cnt != 1) @@ -1144,32 +1165,12 @@ do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, if (dzp == NULL) return (DLADM_STATUS_NOMEM); - if (prop_val) { - /* - * The prop_val contains zone_name{:linkname}. The linkname is - * present only when the link is a ppa-hacked vlan. - */ - cp = strchr(*prop_val, ':'); - if (cp) { - (void) strlcpy(linkname, cp + 1, MAXLINKNAMELEN); - *cp = '\0'; - is_ppa_hack = B_TRUE; - } else { - status = dladm_datalink_id2info(linkid, NULL, NULL, - NULL, linkname, MAXLINKNAMELEN); - if (status != DLADM_STATUS_OK) { - goto done; - } - } - zone_name = *prop_val; - } else { - zone_name = GLOBAL_ZONENAME; - if ((status = dladm_datalink_id2info(linkid, NULL, NULL, NULL, - linkname, MAXLINKNAMELEN)) != DLADM_STATUS_OK) { - goto done; - } + if ((status = dladm_datalink_id2info(linkid, NULL, NULL, NULL, + linkname, MAXLINKNAMELEN)) != DLADM_STATUS_OK) { + goto done; } + zone_name = (prop_val != NULL) ? 
*prop_val : GLOBAL_ZONENAME; if (strlen(linkname) > MAXLINKNAMELEN) { status = DLADM_STATUS_BADVAL; goto done; @@ -1199,7 +1200,6 @@ do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, dzp->diz_zid = zoneid; (void) strlcpy(dzp->diz_link, linkname, MAXLINKNAMELEN); - dzp->diz_is_ppa_hack = is_ppa_hack; vdp->vd_val = (uintptr_t)dzp; return (DLADM_STATUS_OK); @@ -1210,9 +1210,359 @@ done: /* ARGSUSED */ static dladm_status_t -do_get_autopush(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +dld_maxbw_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) +{ + dld_ioc_macprop_t *dip; + mac_resource_props_t mrp; + dladm_status_t status; + + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); + if (dip == NULL) + return (status); + + bcopy(dip->pr_val, &mrp, sizeof (mac_resource_props_t)); + free(dip); + + if ((mrp.mrp_mask & MRP_MAXBW) == 0) { + (*prop_val)[0] = '\0'; + } else { + (void) dladm_bw2str(mrp.mrp_maxbw, prop_val[0]); + } + *val_cnt = 1; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_maxbw(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, + uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) +{ + uint64_t *maxbw; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + maxbw = malloc(sizeof (uint64_t)); + if (maxbw == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2bw(*prop_val, maxbw); + if (status != DLADM_STATUS_OK) { + free(maxbw); + return (status); + } + + if ((*maxbw < MRP_MAXBW_MINVAL) && (*maxbw != 0)) { + free(maxbw); + return (DLADM_STATUS_MINMAXBW); + } + + vdp->vd_val = (uintptr_t)maxbw; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +dladm_status_t +do_extract_maxbw(val_desc_t *vdp, void *arg, uint_t cnt) +{ + mac_resource_props_t *mrp = (mac_resource_props_t *)arg; + + bcopy((char *)vdp->vd_val, &mrp->mrp_maxbw, sizeof (uint64_t)); + mrp->mrp_mask |= MRP_MAXBW; + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +dld_cpus_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) +{ + dld_ioc_macprop_t *dip; + mac_resource_props_t mrp; + int i; + uint32_t ncpus; + uchar_t *cp; + dladm_status_t status; + + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); + if (dip == NULL) + return (status); + + cp = (uchar_t *)dip->pr_val; + (void) memcpy(&mrp, cp, sizeof (mac_resource_props_t)); + free(dip); + + ncpus = mrp.mrp_ncpus; + + if (ncpus > *val_cnt) + return (DLADM_STATUS_TOOSMALL); + + if (ncpus == 0) { + (*prop_val)[0] = '\0'; + *val_cnt = 1; + return (DLADM_STATUS_OK); + } + + *val_cnt = ncpus; + for (i = 0; i < ncpus; i++) { + (void) snprintf(prop_val[i], DLADM_PROP_VAL_MAX, + "%u", mrp.mrp_cpu[i]); + } + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_set_res(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, + uint_t val_cnt, uint_t flags, datalink_media_t media) +{ + mac_resource_props_t mrp; + dladm_status_t status = DLADM_STATUS_OK; + dld_ioc_macprop_t *dip; + + bzero(&mrp, sizeof (mac_resource_props_t)); + dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name, + flags, &status); + + if (dip == NULL) + return (status); + + if (vdp->vd_val == RESET_VAL) { + switch 
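/*
 * Editor's sketch, not part of this patch: setting the new "maxbw"
 * property, which is vetted by do_check_maxbw() above (via
 * dladm_str2bw()) and applied by do_set_res() below.  The value string
 * is illustrative and assumes dladm_str2bw() accepts a magnitude
 * suffix.
 *
 *	char *bw = "100m";
 *
 *	(void) dladm_set_linkprop(linkid, "maxbw", &bw, 1,
 *	    DLADM_OPT_ACTIVE);
 */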
(dip->pr_num) {
+		case MAC_PROP_MAXBW:
+			mrp.mrp_maxbw = MRP_MAXBW_RESETVAL;
+			mrp.mrp_mask = MRP_MAXBW;
+			break;
+		case MAC_PROP_PRIO:
+			mrp.mrp_priority = MPL_RESET;
+			mrp.mrp_mask = MRP_PRIORITY;
+			break;
+		default:
+			free(dip);
+			return (DLADM_STATUS_BADARG);
+		}
+	} else {
+		switch (dip->pr_num) {
+		case MAC_PROP_MAXBW:
+			bcopy((void *)vdp->vd_val, &mrp.mrp_maxbw,
+			    sizeof (uint64_t));
+			mrp.mrp_mask = MRP_MAXBW;
+			break;
+		case MAC_PROP_PRIO:
+			bcopy((void *)vdp->vd_val, &mrp.mrp_priority,
+			    sizeof (mac_priority_level_t));
+			mrp.mrp_mask = MRP_PRIORITY;
+			break;
+		default:
+			free(dip);
+			return (DLADM_STATUS_BADARG);
+		}
+	}
+
+	(void) memcpy(dip->pr_val, &mrp, dip->pr_valsize);
+	status = i_dladm_macprop(dip, B_TRUE);
+	free(dip);
+	return (status);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_set_cpus(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp,
+    uint_t val_cnt, uint_t flags, datalink_media_t media)
+{
+	mac_resource_props_t mrp;
+	dladm_status_t status;
+	dld_ioc_macprop_t *dip;
+	datalink_class_t class;
+
+	/*
+	 * CPU bindings can be set on VNICs and regular physical links.
+	 * However, VNICs fail the dladm_phys_info() test, so apply the
+	 * phys_info test only on physical links.
+	 */
+	if ((status = dladm_datalink_id2info(linkid, NULL, &class,
+	    NULL, NULL, 0)) != DLADM_STATUS_OK) {
+		return (status);
+	}
+
+	/*
+	 * We set intr_cpu to -1. The interrupt will be retargeted,
+	 * if possible, when the setup is complete in the MAC layer.
+	 */
+	bzero(&mrp, sizeof (mac_resource_props_t));
+	mrp.mrp_mask = MRP_CPUS;
+	if (vdp != NULL && vdp->vd_val != RESET_VAL) {
+		mac_resource_props_t *vmrp;
+
+		vmrp = (mac_resource_props_t *)vdp->vd_val;
+		if (vmrp->mrp_ncpus > 0) {
+			bcopy(vmrp, &mrp, sizeof (mac_resource_props_t));
+			mrp.mrp_mask = MRP_CPUS;
+		}
+		mrp.mrp_mask |= MRP_CPUS_USERSPEC;
+		mrp.mrp_fanout_mode = MCM_CPUS;
+		mrp.mrp_intr_cpu = -1;
+	}
+
+	dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name,
+	    flags, &status);
+	if (dip == NULL)
+		return (status);
+
+	(void) memcpy(dip->pr_val, &mrp, dip->pr_valsize);
+	status = i_dladm_macprop(dip, B_TRUE);
+	free(dip);
+	return (status);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_cpus(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
+    uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
+{
+	uint32_t cpuid;
+	int i, j, rc;
+	long nproc = sysconf(_SC_NPROCESSORS_CONF);
+	mac_resource_props_t *mrp;
+
+	mrp = malloc(sizeof (mac_resource_props_t));
+	if (mrp == NULL)
+		return (DLADM_STATUS_NOMEM);
+
+	for (i = 0; i < val_cnt; i++) {
+		errno = 0;
+		cpuid = strtol(prop_val[i], (char **)NULL, 10);
+		if (errno != 0 || cpuid >= nproc) {
+			free(mrp);
+			return (DLADM_STATUS_CPUMAX);
+		}
+		rc = p_online(cpuid, P_STATUS);
+		if (rc < 1) {
+			free(mrp);
+			return (DLADM_STATUS_CPUERR);
+		}
+		if (rc != P_ONLINE) {
+			free(mrp);
+			return (DLADM_STATUS_CPUNOTONLINE);
+		}
+		mrp->mrp_cpu[i] = cpuid;
+	}
+	mrp->mrp_ncpus = (uint32_t)val_cnt;
+
+	/* Check for duplicates */
+	for (i = 0; i < val_cnt; i++) {
+		for (j = 0; j < val_cnt; j++) {
+			if (i != j && mrp->mrp_cpu[i] == mrp->mrp_cpu[j]) {
+				free(mrp);
+				return (DLADM_STATUS_BADARG);
+			}
+		}
+	}
+	vdp->vd_val = (uintptr_t)mrp;
+
+	return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+dladm_status_t
+do_extract_cpus(val_desc_t *vdp, void *arg, uint_t cnt)
+{
+	mac_resource_props_t *mrp = (mac_resource_props_t *)arg;
+	mac_resource_props_t *vmrp = (mac_resource_props_t *)vdp->vd_val;
+	int i;
+
+	for (i = 0; i < vmrp->mrp_ncpus; i++) {
+		mrp->mrp_cpu[i] = vmrp->mrp_cpu[i];
+	}
+
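/*
 * Editor's sketch, not part of this patch: the "cpus" property takes one
 * processor id per value; do_check_cpus() above verifies each against
 * p_online() and rejects duplicates.  The CPU numbers are illustrative.
 *
 *	char *cpus[] = { "1", "2" };
 *
 *	(void) dladm_set_linkprop(linkid, "cpus", cpus, 2,
 *	    DLADM_OPT_ACTIVE);
 */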
mrp->mrp_ncpus = vmrp->mrp_ncpus; + mrp->mrp_mask |= (MRP_CPUS|MRP_CPUS_USERSPEC); + mrp->mrp_fanout_mode = MCM_CPUS; + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +dld_priority_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) +{ + dld_ioc_macprop_t *dip; + mac_resource_props_t mrp; + mac_priority_level_t pri; + dladm_status_t status; + + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); + if (dip == NULL) + return (status); + + bcopy(dip->pr_val, &mrp, sizeof (mac_resource_props_t)); + free(dip); + + pri = ((mrp.mrp_mask & MRP_PRIORITY) == 0) ? MPL_HIGH : + mrp.mrp_priority; + + (void) dladm_pri2str(pri, prop_val[0]); + *val_cnt = 1; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_priority(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, + uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) +{ + mac_priority_level_t *pri; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + pri = malloc(sizeof (mac_priority_level_t)); + if (pri == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2pri(*prop_val, pri); + if (status != DLADM_STATUS_OK) { + free(pri); + return (status); + } + + if (*pri < MPL_LOW || *pri > MPL_HIGH) { + free(pri); + return (DLADM_STATUS_BADVAL); + } + + vdp->vd_val = (uintptr_t)pri; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +dladm_status_t +do_extract_priority(val_desc_t *vdp, void *arg, uint_t cnt) +{ + mac_resource_props_t *mrp = (mac_resource_props_t *)arg; + + bcopy((char *)vdp->vd_val, &mrp->mrp_priority, + sizeof (mac_priority_level_t)); + mrp->mrp_mask |= MRP_PRIORITY; + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_get_autopush(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { struct dlautopush dlap; int i, len; @@ -1223,10 +1573,11 @@ do_get_autopush(struct prop_desc *pd, datalink_id_t linkid, return (DLADM_STATUS_NOTDEFINED); *val_cnt = 1; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) { (*prop_val)[0] = '\0'; - goto done; + return (DLADM_STATUS_OK); } (void) memcpy(&dlap, dip->pr_val, sizeof (dlap)); @@ -1246,8 +1597,6 @@ do_get_autopush(struct prop_desc *pd, datalink_id_t linkid, len += (strlen(AP_ANCHOR) + 1); } } - - *perm_flags = dip->pr_perm_flags; free(dip); done: return (DLADM_STATUS_OK); @@ -1292,7 +1641,7 @@ i_dladm_add_ap_module(const char *module, struct dlautopush *dlap) */ /* ARGSUSED */ static dladm_status_t -do_check_autopush(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, +do_check_autopush(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) { char *module; @@ -1331,8 +1680,8 @@ do_check_autopush(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, /* ARGSUSED */ static dladm_status_t -do_get_rate_common(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, uint_t id) +do_get_rate_common(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, uint_t id, uint_t *perm_flags) { wl_rates_t *wrp; uint_t i; @@ -1363,6 +1712,7 @@ do_get_rate_common(struct prop_desc *pd, datalink_id_t linkid, (float)wrp->wl_rates_rates[i] / 2); } 
*val_cnt = wrp->wl_rates_num; + *perm_flags = MAC_PROP_PERM_RW; done: free(wrp); @@ -1370,29 +1720,25 @@ done: } static dladm_status_t -do_get_rate_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_rate_prop(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { if (media != DL_WIFI) { - *perm_flags = MAC_PROP_PERM_READ; - return (i_dladm_speed_get(pd, linkid, prop_val, - val_cnt, flags)); + return (i_dladm_speed_get(pdp, linkid, prop_val, + val_cnt, flags, perm_flags)); } - *perm_flags = MAC_PROP_PERM_RW; - return (do_get_rate_common(pd, linkid, prop_val, val_cnt, - MAC_PROP_WL_DESIRED_RATES)); + return (do_get_rate_common(pdp, linkid, prop_val, val_cnt, + MAC_PROP_WL_DESIRED_RATES, perm_flags)); } /* ARGSUSED */ static dladm_status_t -do_get_rate_mod(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_rate_mod(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { - *perm_flags = MAC_PROP_PERM_READ; - switch (media) { case DL_ETHER: /* @@ -1402,8 +1748,8 @@ do_get_rate_mod(struct prop_desc *pd, datalink_id_t linkid, return (DLADM_STATUS_NOTSUP); case DL_WIFI: - return (do_get_rate_common(pd, linkid, prop_val, val_cnt, - MAC_PROP_WL_SUPPORTED_RATES)); + return (do_get_rate_common(pdp, linkid, prop_val, val_cnt, + MAC_PROP_WL_SUPPORTED_RATES, perm_flags)); default: return (DLADM_STATUS_BADARG); } @@ -1437,7 +1783,7 @@ do_set_rate(datalink_id_t linkid, dladm_wlan_rates_t *rates) /* ARGSUSED */ static dladm_status_t -do_set_rate_prop(prop_desc_t *pd, datalink_id_t linkid, +do_set_rate_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { dladm_wlan_rates_t rates; @@ -1463,7 +1809,7 @@ done: /* ARGSUSED */ static dladm_status_t -do_check_rate(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, +do_check_rate(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) { int i; @@ -1517,16 +1863,15 @@ do_get_phyconf(datalink_id_t linkid, void *buf, int buflen) /* ARGSUSED */ static dladm_status_t -do_get_channel_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_channel_prop(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { uint32_t channel; char buf[WLDP_BUFSIZE]; dladm_status_t status = DLADM_STATUS_OK; wl_phy_conf_t wl_phy_conf; - *perm_flags = MAC_PROP_PERM_READ; if ((status = do_get_phyconf(linkid, buf, sizeof (buf))) != DLADM_STATUS_OK) goto done; @@ -1539,7 +1884,7 @@ do_get_channel_prop(struct prop_desc *pd, datalink_id_t linkid, (void) snprintf(*prop_val, DLADM_STRSIZE, "%u", channel); *val_cnt = 1; - + *perm_flags = MAC_PROP_PERM_READ; done: return (status); } @@ -1553,9 +1898,9 @@ do_get_powermode(datalink_id_t linkid, void *buf, int buflen) /* ARGSUSED */ static dladm_status_t -do_get_powermode_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_powermode_prop(prop_desc_t *pdp, datalink_id_t linkid, + char 
**prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { wl_ps_mode_t mode; const char *s; @@ -1583,12 +1928,8 @@ do_get_powermode_prop(struct prop_desc *pd, datalink_id_t linkid, } (void) snprintf(*prop_val, DLADM_STRSIZE, "%s", s); *val_cnt = 1; - + *perm_flags = MAC_PROP_PERM_RW; done: - if (status == DLADM_STATUS_OK) - *perm_flags = MAC_PROP_PERM_RW; - else - *perm_flags = 0; return (status); } @@ -1618,7 +1959,7 @@ do_set_powermode(datalink_id_t linkid, dladm_wlan_powermode_t *pm) /* ARGSUSED */ static dladm_status_t -do_set_powermode_prop(prop_desc_t *pd, datalink_id_t linkid, +do_set_powermode_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { dladm_wlan_powermode_t powermode = (dladm_wlan_powermode_t)vdp->vd_val; @@ -1641,9 +1982,9 @@ do_get_radio(datalink_id_t linkid, void *buf, int buflen) /* ARGSUSED */ static dladm_status_t -do_get_radio_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_radio_prop(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { wl_radio_t radio; const char *s; @@ -1668,12 +2009,8 @@ do_get_radio_prop(struct prop_desc *pd, datalink_id_t linkid, } (void) snprintf(*prop_val, DLADM_STRSIZE, "%s", s); *val_cnt = 1; - + *perm_flags = MAC_PROP_PERM_RW; done: - if (status == DLADM_STATUS_OK) - *perm_flags = MAC_PROP_PERM_RW; - else - *perm_flags = 0; return (status); } @@ -1698,7 +2035,7 @@ do_set_radio(datalink_id_t linkid, dladm_wlan_radio_t *radio) /* ARGSUSED */ static dladm_status_t -do_set_radio_prop(prop_desc_t *pd, datalink_id_t linkid, +do_set_radio_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t fags, datalink_media_t media) { dladm_wlan_radio_t radio = (dladm_wlan_radio_t)vdp->vd_val; @@ -1860,7 +2197,7 @@ i_dladm_buf_alloc_by_id(size_t valsize, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_set_public_prop(prop_desc_t *pd, datalink_id_t linkid, +i_dladm_set_public_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { dld_ioc_macprop_t *dip; @@ -1870,11 +2207,11 @@ i_dladm_set_public_prop(prop_desc_t *pd, datalink_id_t linkid, uint32_t u32; void *val; - dip = i_dladm_buf_alloc_by_name(0, linkid, pd->pd_name, 0, &status); + dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name, 0, &status); if (dip == NULL) return (status); - if (pd->pd_flags & PD_CHECK_ALLOC) + if (pdp->pd_flags & PD_CHECK_ALLOC) val = (void *)vdp->vd_val; else { /* @@ -1931,7 +2268,7 @@ i_dladm_macprop(void *dip, boolean_t set) static dld_ioc_macprop_t * i_dladm_get_public_prop(datalink_id_t linkid, char *prop_name, uint_t flags, - dladm_status_t *status) + dladm_status_t *status, uint_t *perm_flags) { dld_ioc_macprop_t *dip = NULL; @@ -1944,12 +2281,15 @@ i_dladm_get_public_prop(datalink_id_t linkid, char *prop_name, uint_t flags, free(dip); return (NULL); } + if (perm_flags != NULL) + *perm_flags = dip->pr_perm_flags; + return (dip); } /* ARGSUSED */ static dladm_status_t -i_dladm_defmtu_check(struct prop_desc *pd, datalink_id_t linkid, +i_dladm_defmtu_check(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *v, datalink_media_t media) { if (val_cnt != 1) @@ -1960,9 +2300,9 @@ i_dladm_defmtu_check(struct prop_desc *pd, datalink_id_t linkid, /* 
ARGSUSED */ static dladm_status_t -i_dladm_duplex_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_duplex_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { link_duplex_t link_duplex; dladm_status_t status; @@ -1988,8 +2328,8 @@ i_dladm_duplex_get(struct prop_desc *pd, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_speed_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, uint_t flags) +i_dladm_speed_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, uint_t flags, uint_t *perm_flags) { uint64_t ifspeed = 0; dladm_status_t status; @@ -2006,23 +2346,26 @@ i_dladm_speed_get(struct prop_desc *pd, datalink_id_t linkid, "%llu", ifspeed / 1000000); /* Mbps */ } *val_cnt = 1; + *perm_flags = MAC_PROP_PERM_READ; return (DLADM_STATUS_OK); } /* ARGSUSED */ static dladm_status_t -i_dladm_status_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_status_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { - link_state_t link_state; - dladm_status_t status; - uchar_t *cp; - dld_ioc_macprop_t *dip; + link_state_t link_state; + dladm_status_t status; + uchar_t *cp; + dld_ioc_macprop_t *dip; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (status != DLADM_STATUS_OK) return (status); + cp = (uchar_t *)dip->pr_val; (void) memcpy(&link_state, cp, sizeof (link_state)); @@ -2038,25 +2381,25 @@ i_dladm_status_get(struct prop_desc *pd, datalink_id_t linkid, break; } *val_cnt = 1; - *perm_flags = dip->pr_perm_flags; free(dip); return (DLADM_STATUS_OK); } /* ARGSUSED */ static dladm_status_t -i_dladm_binary_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_binary_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { dld_ioc_macprop_t *dip; dladm_status_t status; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) return (status); + (void) snprintf(*prop_val, DLADM_PROP_VAL_MAX, "%x", dip->pr_val[0]); - *perm_flags = dip->pr_perm_flags; free(dip); *val_cnt = 1; return (DLADM_STATUS_OK); @@ -2064,22 +2407,23 @@ i_dladm_binary_get(struct prop_desc *pd, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_uint32_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_uint32_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { dld_ioc_macprop_t *dip; - uint32_t v = 0; + uint32_t v = 0; uchar_t *cp; dladm_status_t status; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) return (status); + cp = (uchar_t *)dip->pr_val; (void) 
memcpy(&v, cp, sizeof (v)); (void) snprintf(*prop_val, DLADM_PROP_VAL_MAX, "%ld", v); - *perm_flags = dip->pr_perm_flags; free(dip); *val_cnt = 1; return (DLADM_STATUS_OK); @@ -2087,18 +2431,20 @@ i_dladm_uint32_get(struct prop_desc *pd, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_flowctl_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_flowctl_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { dld_ioc_macprop_t *dip; link_flowctrl_t v; dladm_status_t status; uchar_t *cp; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) return (status); + cp = (uchar_t *)dip->pr_val; (void) memcpy(&v, cp, sizeof (v)); switch (v) { @@ -2115,7 +2461,6 @@ i_dladm_flowctl_get(struct prop_desc *pd, datalink_id_t linkid, (void) sprintf(*prop_val, "bi"); break; } - *perm_flags = dip->pr_perm_flags; free(dip); *val_cnt = 1; return (DLADM_STATUS_OK); @@ -2220,17 +2565,7 @@ i_dladm_get_prop(datalink_id_t linkid, const char *prop_name, if ((status = i_dladm_macprop(dip, B_FALSE)) == DLADM_STATUS_OK) { if (type == DLADM_PROP_VAL_PERM) { - switch (dip->pr_perm_flags) { - case MAC_PROP_PERM_READ: - (void) strncpy(*prop_val, - PERM_READ_ONLY, DLADM_PROP_VAL_MAX); - break; - case MAC_PROP_PERM_RW: - (void) strncpy(*prop_val, - PERM_READ_WRITE, - DLADM_PROP_VAL_MAX); - break; - } + (void) dladm_perm2str(dip->pr_perm_flags, *prop_val); } else { (void) strncpy(*prop_val, dip->pr_val, DLADM_PROP_VAL_MAX); @@ -2434,3 +2769,189 @@ i_dladm_wlan_set_legacy_ioctl(datalink_id_t linkid, void *buf, uint_t buflen, free(gbuf); return (status); } + +static dladm_status_t +link_proplist_check(dladm_arg_list_t *proplist) +{ + int i, j; + boolean_t matched; + + for (i = 0; i < proplist->al_count; i++) { + matched = B_FALSE; + for (j = 0; j < DLADM_MAX_PROPS; j++) { + if (strcmp(proplist->al_info[i].ai_name, + prop_table[j].pd_name) == 0) + matched = B_TRUE; + } + if (!matched) + return (DLADM_STATUS_BADPROP); + } + return (DLADM_STATUS_OK); +} + +dladm_status_t +dladm_parse_link_props(char *str, dladm_arg_list_t **listp, boolean_t novalues) +{ + dladm_status_t status; + + status = dladm_parse_args(str, listp, novalues); + if (status != DLADM_STATUS_OK) + return (status); + + status = link_proplist_check(*listp); + if (status != DLADM_STATUS_OK) { + dladm_free_props(*listp); + return (status); + } + + return (DLADM_STATUS_OK); +} + +/* + * Retrieve the one link property from the database + */ +/*ARGSUSED*/ +static int +i_dladm_get_one_prop(datalink_id_t linkid, const char *prop_name, void *arg) +{ + dladm_arg_list_t *proplist = arg; + dladm_arg_info_t *aip = NULL; + + aip = &proplist->al_info[proplist->al_count]; + /* + * it is fine to point to prop_name since prop_name points to the + * prop_table[n].pd_name. + */ + aip->ai_name = prop_name; + + (void) dladm_get_linkprop(linkid, DLADM_PROP_VAL_PERSISTENT, prop_name, + aip->ai_val, &aip->ai_count); + + if (aip->ai_count != 0) + proplist->al_count++; + + return (DLADM_WALK_CONTINUE); +} + + +/* + * Retrieve all link properties for a link from the database and + * return a property list. 
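+ * The caller is responsible for freeing the returned list, for + * example with dladm_free_props().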
+ */ +dladm_status_t +dladm_link_get_proplist(datalink_id_t linkid, dladm_arg_list_t **listp) +{ + dladm_arg_list_t *list; + dladm_status_t status = DLADM_STATUS_OK; + + list = calloc(1, sizeof (dladm_arg_list_t)); + if (list == NULL) + return (dladm_errno2status(errno)); + + status = dladm_walk_linkprop(linkid, list, i_dladm_get_one_prop); + + *listp = list; + return (status); +} + +/* + * Retrieve the named property from a proplist, check the value and + * convert to a kernel structure. + */ +static dladm_status_t +i_dladm_link_proplist_extract_one(dladm_arg_list_t *proplist, + const char *name, void *val) +{ + dladm_status_t status; + dladm_arg_info_t *aip = NULL; + int i, j; + + /* Find named property in proplist */ + for (i = 0; i < proplist->al_count; i++) { + aip = &proplist->al_info[i]; + if (strcasecmp(aip->ai_name, name) == 0) + break; + } + + /* Property not in list */ + if (i == proplist->al_count) + return (DLADM_STATUS_OK); + + for (i = 0; i < DLADM_MAX_PROPS; i++) { + prop_desc_t *pdp = &prop_table[i]; + val_desc_t *vdp; + + vdp = malloc(sizeof (val_desc_t) * aip->ai_count); + if (vdp == NULL) + return (DLADM_STATUS_NOMEM); + + if (strcasecmp(aip->ai_name, pdp->pd_name) != 0) + continue; + + if (aip->ai_val == NULL) + return (DLADM_STATUS_BADARG); + + /* Check property value */ + if (pdp->pd_check != NULL) { + status = pdp->pd_check(pdp, 0, aip->ai_val, + aip->ai_count, vdp, 0); + } else { + status = DLADM_STATUS_BADARG; + } + + if (status != DLADM_STATUS_OK) + return (status); + + for (j = 0; j < DLADM_MAX_RSRC_PROP; j++) { + resource_prop_t *rpp = &rsrc_prop_table[j]; + + if (strcasecmp(aip->ai_name, rpp->rp_name) != 0) + continue; + + /* Extract kernel structure */ + if (rpp->rp_extract != NULL) { + status = rpp->rp_extract(vdp, val, + aip->ai_count); + } else { + status = DLADM_STATUS_BADARG; + } + break; + } + + if (status != DLADM_STATUS_OK) + return (status); + + break; + } + return (status); +} + +/* + * Extract properties from a proplist and convert to mac_resource_props_t. + */ +dladm_status_t +dladm_link_proplist_extract(dladm_arg_list_t *proplist, + mac_resource_props_t *mrp) +{ + dladm_status_t status = DLADM_STATUS_OK; + + status = i_dladm_link_proplist_extract_one(proplist, "maxbw", mrp); + if (status != DLADM_STATUS_OK) + return (status); + status = i_dladm_link_proplist_extract_one(proplist, "priority", mrp); + if (status != DLADM_STATUS_OK) + return (status); + status = i_dladm_link_proplist_extract_one(proplist, "cpus", mrp); + if (status != DLADM_STATUS_OK) + return (status); + return (status); +} + +static const char * +dladm_perm2str(uint_t perm, char *buf) +{ + (void) snprintf(buf, DLADM_STRSIZE, "%c%c", + ((perm & MAC_PROP_PERM_READ) != 0) ? 'r' : '-', + ((perm & MAC_PROP_PERM_WRITE) != 0) ? 'w' : '-'); + return (buf); +} diff --git a/usr/src/lib/libdladm/common/llib-ldladm b/usr/src/lib/libdladm/common/llib-ldladm index a6fc19b517..ae8bb981bf 100644 --- a/usr/src/lib/libdladm/common/llib-ldladm +++ b/usr/src/lib/libdladm/common/llib-ldladm @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /*LINTLIBRARY*/ /*PROTOLIB1*/ @@ -34,3 +32,5 @@ #include <libdlvnic.h> #include <libdlvlan.h> #include <libdlmgmt.h> +#include <libdlflow.h> +#include <libdlstat.h> diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers index 9c61b84883..bd8d6a9eb1 100644 --- a/usr/src/lib/libdladm/common/mapfile-vers +++ b/usr/src/lib/libdladm/common/mapfile-vers @@ -35,7 +35,6 @@ SUNWprivate_1.1 { dladm_valid_linkname; dladm_mac_walk; dladm_init_linkprop; - dladm_get_single_mac_stat; dladm_get_linkprop; dladm_set_linkprop; dladm_walk_linkprop; @@ -44,6 +43,8 @@ SUNWprivate_1.1 { dladm_set_secobj; dladm_unset_secobj; dladm_walk_secobj; + dladm_bw2str; + dladm_str2bw; dladm_secobjclass2str; dladm_str2secobjclass; dladm_aggr_up; @@ -118,12 +119,60 @@ SUNWprivate_1.1 { dladm_wlan_wpa_set_key; dladm_wlan_wpa_set_mlme; dladm_vnic_create; - dladm_vnic_modify; dladm_vnic_delete; dladm_vnic_info; dladm_vnic_str2macaddrtype; - dladm_kstat_value; + dladm_vnic_up; + dladm_walk_macaddr; + dladm_walk_hwgrp; + dladm_pri2str; + dladm_str2pri; + dladm_start_usagelog; + dladm_stop_usagelog; + dladm_walk_usage_res; + dladm_walk_usage_time; + dladm_usage_summary; + dladm_usage_dates; + + dladm_flow_add; + dladm_flow_remove; + dladm_flow_parse_db; + dladm_walk_flow; + dladm_flow_init; + dladm_flow_info; + dladm_prefixlen2mask; + dladm_mask2prefixlen; + dladm_str2proto; + dladm_proto2str; + + dladm_free_attrs; + dladm_parse_flow_attrs; + + dladm_flow_attr_ip2str; + dladm_flow_attr_proto2str; + dladm_flow_attr_port2str; + dladm_flow_attr_dsfield2str; + + dladm_free_props; + dladm_parse_link_props; + dladm_get_linkprop; + dladm_set_linkprop; + dladm_walk_linkprop; + dladm_parse_flow_props; + dladm_get_flowprop; + dladm_set_flowprop; + dladm_walk_flowprop; + dladm_parselink; + + dladm_continuous; + dladm_kstat_lookup; + dladm_get_stats; + dladm_kstat_value; + dladm_get_single_mac_stat; + dladm_stats_total; + dladm_stats_diff; + local: *; }; diff --git a/usr/src/lib/libdladm/common/propfuncs.c b/usr/src/lib/libdladm/common/propfuncs.c new file mode 100644 index 0000000000..74964511eb --- /dev/null +++ b/usr/src/lib/libdladm/common/propfuncs.c @@ -0,0 +1,699 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <stdlib.h> +#include <strings.h> +#include <errno.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/dld.h> +#include <fcntl.h> +#include <unistd.h> +#include <libdladm_impl.h> +#include <libdlflow_impl.h> + +/* + * XXX duplicate defines + */ +#define DLADM_PROP_VAL_MAX 32 +#define DLADM_MAX_PROPS 32 + +static void +free_props(prop_db_info_t *lip) +{ + prop_db_info_t *lip_next; + prop_val_t *lvp, *lvp_next; + + for (; lip != NULL; lip = lip_next) { + lip_next = lip->li_nextprop; + for (lvp = lip->li_val; lvp != NULL; lvp = lvp_next) { + lvp_next = lvp->lv_nextval; + free(lvp); + } + free(lip); + } +} + +/* + * Generate an entry in the property database. + * Each entry has this format: + * <name> <prop0>=<val0>,...,<valn>;...;<propn>=<val0>,...,<valn>; + */ +static void +generate_prop_line(const char *name, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + char tmpbuf[MAXLINELEN]; + char *ptr, *lim = tmpbuf + MAXLINELEN; + prop_db_info_t *lip = listp; + prop_val_t *lvp = NULL; + + /* + * Delete line if there are no properties left. + */ + if (lip == NULL || + (lip->li_val == NULL && lip->li_nextprop == NULL)) { + buf[0] = '\0'; + return; + } + ptr = tmpbuf; + ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s\t", name); + for (; lip != NULL; lip = lip->li_nextprop) { + /* + * Skip properties without values. + */ + if (lip->li_val == NULL) + continue; + + ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s=", lip->li_name); + for (lvp = lip->li_val; lvp != NULL; lvp = lvp->lv_nextval) { + ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s%c", + lvp->lv_name, + ((lvp->lv_nextval == NULL) ? ';' : ',')); + } + } + if (ptr > lim) { + *statusp = DLADM_STATUS_TOOSMALL; + return; + } + (void) snprintf(buf, MAXLINELEN, "%s\n", tmpbuf); +} + +/* + * This function is used to update or create an entry in the persistent db. + * process_prop_db() will first scan the db for an entry matching the + * specified name. If a match is found, this function is invoked with the + * entry's contents (buf) and its linked-list representation (listp). lsp + * holds the name and values of the property to be added or updated; this + * information will be merged with listp. Subsequently, an updated entry + * will be written to buf, which will in turn be written to disk by + * process_prop_db(). If no entry matches the specified name, listp + * will be NULL; a new entry will be generated in this case and it will + * contain only the property information in lsp. + */ +boolean_t +process_prop_set(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + dladm_status_t status; + prop_db_info_t *lastp = NULL, *lip = listp, *nlip = NULL; + prop_val_t **lvpp; + int i; + + if (lsp->ls_propname == NULL) { + buf[0] = '\0'; + return (B_FALSE); + } + + /* + * Find the prop we want to change. + */ + for (; lip != NULL; lip = lip->li_nextprop) { + if (strcmp(lip->li_name, lsp->ls_propname) == 0) + break; + + lastp = lip; + } + + if (lip == NULL) { + /* + * If the prop is not found, append it to the list. + */ + if ((nlip = malloc(sizeof (prop_db_info_t))) == NULL) { + status = DLADM_STATUS_NOMEM; + goto fail; + } + /* + * nlip will need to be freed later if there is no list to + * append to. + */ + if (lastp != NULL) + lastp->li_nextprop = nlip; + nlip->li_name = lsp->ls_propname; + nlip->li_nextprop = NULL; + nlip->li_val = NULL; + lvpp = &nlip->li_val; + } else { + prop_val_t *lvp, *lvp_next; + + /* + * If the prop is found, delete the existing values from it. 
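+ * They will be replaced below by the values supplied in lsp.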
+ */ + for (lvp = lip->li_val; lvp != NULL; lvp = lvp_next) { + lvp_next = lvp->lv_nextval; + free(lvp); + } + lip->li_val = NULL; + lvpp = &lip->li_val; + } + + /* + * Fill our prop with the specified values. + */ + for (i = 0; i < *lsp->ls_valcntp; i++) { + if ((*lvpp = malloc(sizeof (prop_val_t))) == NULL) { + status = DLADM_STATUS_NOMEM; + goto fail; + } + (*lvpp)->lv_name = lsp->ls_propval[i]; + (*lvpp)->lv_nextval = NULL; + lvpp = &(*lvpp)->lv_nextval; + } + + if (listp != NULL) { + generate_prop_line(lsp->ls_name, buf, listp, statusp); + } else { + generate_prop_line(lsp->ls_name, buf, nlip, statusp); + free_props(nlip); + } + return (B_FALSE); + +fail: + *statusp = status; + if (listp == NULL) + free_props(nlip); + + return (B_FALSE); +} + +/* + * This function is used for retrieving the values for a specific property. + * It gets called if an entry matching the specified name exists in the db. + * The entry is converted into a linked-list listp. This list is then scanned + * for the specified property name; if a matching property exists, its + * associated values are copied to the array lsp->ls_propval. + */ +/* ARGSUSED */ +boolean_t +process_prop_get(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + prop_db_info_t *lip = listp; + prop_val_t *lvp; + uint_t valcnt = 0; + + /* + * Find the prop we want to get. + */ + for (; lip != NULL; lip = lip->li_nextprop) { + if (strcmp(lip->li_name, lsp->ls_propname) == 0) + break; + } + if (lip == NULL) { + *statusp = DLADM_STATUS_NOTFOUND; + return (B_FALSE); + } + + for (lvp = lip->li_val; lvp != NULL; lvp = lvp->lv_nextval) { + (void) strncpy(lsp->ls_propval[valcnt], lvp->lv_name, + DLADM_PROP_VAL_MAX); + + if (++valcnt >= *lsp->ls_valcntp && lvp->lv_nextval != NULL) { + *statusp = DLADM_STATUS_TOOSMALL; + return (B_FALSE); + } + } + /* + * This function is meant to be called at most once for each call + * to process_prop_db(). For this reason, it's ok to overwrite + * the caller's valcnt array size with the actual number of values + * returned. + */ + *lsp->ls_valcntp = valcnt; + return (B_FALSE); +} + +/* + * This is used for initializing properties. + * Unlike the other routines, this gets called for every entry in the + * database. lsp->ls_name is not user-specified but instead is set to + * the current name being processed. + */ +/* ARGSUSED */ +boolean_t +process_prop_init(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + dladm_status_t status = DLADM_STATUS_OK; + prop_db_info_t *lip = listp; + prop_val_t *lvp; + uint_t valcnt, i; + char **propval; + + for (; lip != NULL; lip = lip->li_nextprop) { + /* + * Construct the propval array and fill it with + * values from listp. + */ + for (lvp = lip->li_val, valcnt = 0; + lvp != NULL; lvp = lvp->lv_nextval, valcnt++) { + } + + propval = malloc(sizeof (char *) * valcnt); + if (propval == NULL) { + *statusp = DLADM_STATUS_NOMEM; + break; + } + lvp = lip->li_val; + for (i = 0; i < valcnt; i++, lvp = lvp->lv_nextval) + propval[i] = (char *)lvp->lv_name; + + status = (*lsp->ls_initop)(lsp->ls_name, lip->li_name, + propval, valcnt, DLADM_OPT_ACTIVE, NULL); + + /* + * We continue with initializing other properties even + * after encountering an error. This error will be + * propagated to the caller via 'statusp'. 
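+ * If several properties fail, only the last error is preserved in + * 'statusp'.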
+ */ + if (status != DLADM_STATUS_OK) + *statusp = status; + + free(propval); + } + return (B_TRUE); +} + +static int +parse_props(char *buf, prop_db_info_t **lipp) +{ + int i, len; + char *curr; + prop_db_info_t *lip = NULL; + prop_db_info_t **tailp = lipp; + prop_val_t *lvp = NULL; + prop_val_t **vtailp = NULL; + + curr = buf; + len = strlen(buf); + for (i = 0; i < len; i++) { + char c = buf[i]; + boolean_t match = (c == '=' || c == ',' || c == ';'); + + /* + * Move to the next character if there is no match and + * if we have not reached the last character. + */ + if (!match && i != len - 1) + continue; + + if (match) { + /* + * Nul-terminate the string pointed to by 'curr'. + */ + buf[i] = '\0'; + if (*curr == '\0') + goto fail; + } + + if (lip != NULL) { + /* + * We get here after we have processed the "<prop>=" + * pattern. The pattern we are now interested in is + * "<val0>,<val1>,...,<valn>;". For each value we + * find, a prop_val_t will be allocated and + * added to the current 'lip'. + */ + if (c == '=') + goto fail; + + lvp = malloc(sizeof (*lvp)); + if (lvp == NULL) + goto fail; + + lvp->lv_name = curr; + lvp->lv_nextval = NULL; + *vtailp = lvp; + vtailp = &lvp->lv_nextval; + + if (c == ';') { + tailp = &lip->li_nextprop; + vtailp = NULL; + lip = NULL; + } + } else { + /* + * lip == NULL indicates that 'curr' must be referring + * to a property name. We allocate a new prop_db_info_t + * and append it to the list given by the caller. + */ + if (c != '=') + goto fail; + + lip = malloc(sizeof (*lip)); + if (lip == NULL) + goto fail; + + lip->li_name = curr; + lip->li_val = NULL; + lip->li_nextprop = NULL; + *tailp = lip; + vtailp = &lip->li_val; + } + curr = buf + i + 1; + } + /* + * The list must be non-empty and the last character must be ';'. + */ + if (*lipp == NULL || lip != NULL) + goto fail; + + return (0); + +fail: + free_props(*lipp); + *lipp = NULL; + return (-1); +} + +static boolean_t +process_prop_line(prop_db_state_t *lsp, char *buf, + dladm_status_t *statusp) +{ + prop_db_info_t *lip = NULL; + int i, len, llen; + char *str, *lasts; + boolean_t cont, noname = B_FALSE; + + /* + * Skip leading spaces, blank lines, and comments. + */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (!isspace(buf[i])) + break; + } + if (i == len || buf[i] == '#') + return (B_TRUE); + + str = buf + i; + if (lsp->ls_name != NULL) { + /* + * Skip names we're not interested in. + * Note that strncmp() and isspace() are used here + * instead of strtok() and strcmp() because we don't + * want to modify buf in case it does not contain the + * specified name. + */ + llen = strlen(lsp->ls_name); + if (strncmp(str, lsp->ls_name, llen) != 0 || + !isspace(str[llen])) + return (B_TRUE); + } else { + /* + * If a name is not specified, find the name + * and assign it to lsp->ls_name. + */ + if (strtok_r(str, " \n\t", &lasts) == NULL) + goto fail; + + llen = strlen(str); + lsp->ls_name = str; + noname = B_TRUE; + } + str += llen + 1; + if (str >= buf + len) + goto fail; + + /* + * Now find the list of properties. + */ + if ((str = strtok_r(str, " \n\t", &lasts)) == NULL) + goto fail; + + if (parse_props(str, &lip) < 0) + goto fail; + + cont = (*lsp->ls_op)(lsp, buf, lip, statusp); + free_props(lip); + if (noname) + lsp->ls_name = NULL; + return (cont); + +fail: + free_props(lip); + if (noname) + lsp->ls_name = NULL; + + /* + * Delete corrupted line. 
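+ * Emptying the buffer tells process_prop_db() not to write the + * line back to disk.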
+ */ + buf[0] = '\0'; + return (B_TRUE); +} + +dladm_status_t +process_prop_db(void *arg, FILE *fp, FILE *nfp) +{ + prop_db_state_t *lsp = arg; + dladm_status_t status = DLADM_STATUS_OK; + char buf[MAXLINELEN]; + boolean_t cont = B_TRUE; + + /* + * This loop processes each line of the configuration file. + * buf can potentially be modified by process_prop_line(). + * If this is a write operation and buf is not truncated, buf will + * be written to disk. process_prop_line() will no longer be + * called after it returns B_FALSE; at which point the remainder + * of the file will continue to be read and, if necessary, written + * to disk as well. + */ + while (fgets(buf, MAXLINELEN, fp) != NULL) { + if (cont) + cont = process_prop_line(lsp, buf, &status); + + if (nfp != NULL && buf[0] != '\0' && fputs(buf, nfp) == EOF) { + status = dladm_errno2status(errno); + break; + } + } + + if (status != DLADM_STATUS_OK || !cont) + return (status); + + if (lsp->ls_op == process_prop_set) { + /* + * If the specified name is not found above, we add the + * name and its properties to the configuration file. + */ + (void) (*lsp->ls_op)(lsp, buf, NULL, &status); + if (status == DLADM_STATUS_OK && fputs(buf, nfp) == EOF) + status = dladm_errno2status(errno); + } + + if (lsp->ls_op == process_prop_get) + status = DLADM_STATUS_NOTFOUND; + + return (status); +} + +dladm_status_t +i_dladm_get_prop_temp(const char *name, prop_type_t type, + const char *prop_name, char **prop_val, uint_t *val_cntp, + prop_table_t *prop_tbl) +{ + int i; + dladm_status_t status; + uint_t cnt; + fprop_desc_t *pdp; + + if (name == NULL || prop_name == NULL || prop_val == NULL || + val_cntp == NULL || *val_cntp == 0) + return (DLADM_STATUS_BADARG); + + for (i = 0; i < prop_tbl->pt_size; i++) + if (strcasecmp(prop_name, prop_tbl->pt_table[i].pd_name) == 0) + break; + + if (i == prop_tbl->pt_size) + return (DLADM_STATUS_NOTFOUND); + + pdp = &prop_tbl->pt_table[i]; + status = DLADM_STATUS_OK; + + switch (type) { + case DLADM_PROP_VAL_CURRENT: + status = pdp->pd_get(name, prop_val, val_cntp); + break; + case DLADM_PROP_VAL_DEFAULT: + if (pdp->pd_defval.vd_name == NULL) { + status = DLADM_STATUS_NOTSUP; + break; + } + (void) strcpy(*prop_val, pdp->pd_defval.vd_name); + *val_cntp = 1; + break; + + case DLADM_PROP_VAL_MODIFIABLE: + if (pdp->pd_getmod != NULL) { + status = pdp->pd_getmod(name, prop_val, val_cntp); + break; + } + cnt = pdp->pd_nmodval; + if (cnt == 0) { + status = DLADM_STATUS_NOTSUP; + } else if (cnt > *val_cntp) { + status = DLADM_STATUS_TOOSMALL; + } else { + for (i = 0; i < cnt; i++) { + (void) strcpy(prop_val[i], + pdp->pd_modval[i].vd_name); + } + *val_cntp = cnt; + } + break; + default: + status = DLADM_STATUS_BADARG; + break; + } + + return (status); +} + +static dladm_status_t +i_dladm_set_one_prop_temp(const char *name, fprop_desc_t *pdp, char **prop_val, + uint_t val_cnt, uint_t flags) +{ + dladm_status_t status; + val_desc_t *vdp = NULL; + uint_t cnt; + + if (pdp->pd_temponly && (flags & DLADM_OPT_PERSIST) != 0) + return (DLADM_STATUS_TEMPONLY); + + if (pdp->pd_set == NULL) + return (DLADM_STATUS_PROPRDONLY); + + if (prop_val != NULL) { + if (pdp->pd_check != NULL) + status = pdp->pd_check(pdp, prop_val, val_cnt, &vdp); + else + status = DLADM_STATUS_BADARG; + + if (status != DLADM_STATUS_OK) + return (status); + + cnt = val_cnt; + } else { + if (pdp->pd_defval.vd_name == NULL) + return (DLADM_STATUS_NOTSUP); + + if ((vdp = malloc(sizeof (val_desc_t))) == NULL) + return (DLADM_STATUS_NOMEM); + + (void) memcpy(vdp, 
&pdp->pd_defval, sizeof (val_desc_t)); + cnt = 1; + } + + status = pdp->pd_set(name, vdp, cnt); + + free(vdp); + return (status); +} + +dladm_status_t +i_dladm_set_prop_temp(const char *name, const char *prop_name, char **prop_val, + uint_t val_cnt, uint_t flags, char **errprop, prop_table_t *prop_tbl) +{ + int i; + dladm_status_t status = DLADM_STATUS_OK; + boolean_t found = B_FALSE; + + for (i = 0; i < prop_tbl->pt_size; i++) { + fprop_desc_t *pdp = &prop_tbl->pt_table[i]; + dladm_status_t s; + + if (prop_name != NULL && + (strcasecmp(prop_name, pdp->pd_name) != 0)) + continue; + + found = B_TRUE; + s = i_dladm_set_one_prop_temp(name, pdp, prop_val, val_cnt, + flags); + + if (prop_name != NULL) { + status = s; + break; + } else { + if (s != DLADM_STATUS_OK && + s != DLADM_STATUS_NOTSUP) { + if (errprop != NULL) + *errprop = pdp->pd_name; + status = s; + break; + } + } + } + + if (!found) + status = DLADM_STATUS_NOTFOUND; + + return (status); +} + +boolean_t +i_dladm_is_prop_temponly(const char *prop_name, char **errprop, + prop_table_t *prop_tbl) +{ + int i; + + if (prop_name == NULL) + return (B_FALSE); + + for (i = 0; i < prop_tbl->pt_size; i++) { + fprop_desc_t *pdp = &prop_tbl->pt_table[i]; + + if (strcasecmp(prop_name, pdp->pd_name) != 0) + continue; + + if (errprop != NULL) + *errprop = pdp->pd_name; + + if (pdp->pd_temponly) + return (B_TRUE); + } + + return (B_FALSE); +} +void +dladm_free_props(dladm_arg_list_t *list) +{ + dladm_free_args(list); +} + +dladm_status_t +dladm_parse_props(char *str, dladm_arg_list_t **listp, boolean_t novalues) +{ + if (dladm_parse_args(str, listp, novalues) != DLADM_STATUS_OK) + goto fail; + + return (DLADM_STATUS_OK); + +fail: + dladm_free_args(*listp); + return (DLADM_STATUS_PROP_PARSE_ERR); +} diff --git a/usr/src/lib/libdladm/common/usage.c b/usr/src/lib/libdladm/common/usage.c new file mode 100644 index 0000000000..07ef7bbb22 --- /dev/null +++ b/usr/src/lib/libdladm/common/usage.c @@ -0,0 +1,1437 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <fcntl.h> +#include <stdlib.h> +#include <strings.h> +#include <exacct.h> +#include <libdladm.h> + +#define TIMEBUFLEN 20 +#define GBIT 1000000000 +#define MBIT 1000000 +#define KBIT 1000 + +#define NET_RESET_TOT(tbytes, ttime, tibytes, tobytes, step) { \ + (step) = 1; \ + (tbytes) = 0; \ + (ttime) = 0; \ + (tibytes) = 0; \ + (tobytes) = 0; \ + } + +/* Flow/Link Descriptor */ +typedef struct net_desc_s { + char net_desc_name[LIFNAMSIZ]; + char net_desc_devname[LIFNAMSIZ]; + uchar_t net_desc_ehost[ETHERADDRL]; + uchar_t net_desc_edest[ETHERADDRL]; + ushort_t net_desc_vlan_tpid; + ushort_t net_desc_vlan_tci; + ushort_t net_desc_sap; + ushort_t net_desc_cpuid; + ushort_t net_desc_priority; + uint64_t net_desc_bw_limit; + in6_addr_t net_desc_saddr; + in6_addr_t net_desc_daddr; + boolean_t net_desc_isv4; + in_port_t net_desc_sport; + in_port_t net_desc_dport; + uint8_t net_desc_protocol; + uint8_t net_desc_dsfield; + boolean_t net_desc_newrec; +} net_desc_t; + +/* Time structure: Year, Month, Day, Hour, Min, Sec */ +typedef struct net_time_s { + int net_time_yr; + int net_time_mon; + int net_time_day; + int net_time_hr; + int net_time_min; + int net_time_sec; +} net_time_t; + +/* Flow/Link Stats */ +typedef struct net_stat_s { + char net_stat_name[LIFNAMSIZ]; + uint64_t net_stat_ibytes; + uint64_t net_stat_obytes; + uint64_t net_stat_ipackets; + uint64_t net_stat_opackets; + uint64_t net_stat_ierrors; + uint64_t net_stat_oerrors; + uint64_t net_stat_tibytes; + uint64_t net_stat_tobytes; + uint64_t net_stat_tipackets; + uint64_t net_stat_topackets; + uint64_t net_stat_tierrors; + uint64_t net_stat_toerrors; + uint64_t net_stat_ctime; + uint64_t net_stat_tdiff; + net_time_t net_stat_time; + struct net_stat_s *net_stat_next; + net_desc_t *net_stat_desc; + boolean_t net_stat_isref; +} net_stat_t; + +/* Used to create the [gnu]plot file */ +typedef struct net_plot_entry_s { + char *net_pe_name; + uint64_t net_pe_tottime; + uint64_t net_pe_totbytes; + uint64_t net_pe_totibytes; + uint64_t net_pe_totobytes; + uint64_t net_pe_lasttime; +} net_plot_entry_t; + +/* Stats entry */ +typedef struct net_entry_s { + net_desc_t *net_entry_desc; + net_stat_t *net_entry_shead; + net_stat_t *net_entry_stail; + int net_entry_scount; + net_stat_t *net_entry_sref; + net_stat_t *net_entry_tstats; + uint64_t net_entry_ttime; + struct net_entry_s *net_entry_next; +} net_entry_t; + +/* Time sorted list */ +typedef struct net_time_entry_s { + net_stat_t *my_time_stat; + struct net_time_entry_s *net_time_entry_next; + struct net_time_entry_s *net_time_entry_prev; +} net_time_entry_t; + +/* The parsed table */ +typedef struct net_table_s { + /* List of stats */ + net_entry_t *net_table_head; + net_entry_t *net_table_tail; + int net_entries; + + /* + * Optimization I : List sorted by time, i.e: + * Time Resource .. + * ------------------------------- + * 11.15.10 bge0 + * 11.15.10 ce0 + * 11.15.10 vnic1 + * 11.15.15 bge0 + * 11.15.15 ce0 + * 11.15.15 vnic1 + */ + net_time_entry_t *net_time_head; + net_time_entry_t *net_time_tail; + + /* + * Optimization II : List sorted by resources + * Time Resource .. + * ------------------------------- + * 11.15.10 bge0 + * 11.15.15 bge0 + * 11.15.10 ce0 + * 11.15.15 ce0 + * 11.15.10 vnic1 + * 11.15.15 vnic1 + */ + net_time_entry_t *net_ctime_head; + net_time_entry_t *net_ctime_tail; + + /* Common to both the above (sorted) lists. 
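Each stat record appears exactly once on each list.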
*/ + int net_time_entries; +} net_table_t; + +#define NET_DATE_GREATER 0 +#define NET_DATE_LESSER 1 +#define NET_DATE_EQUAL 2 + +#define NET_TIME_GREATER 0 +#define NET_TIME_LESSER 1 +#define NET_TIME_EQUAL 2 + +#ifndef _LP64 +#define FMT_UINT64 "%-15llu" +#else +#define FMT_UINT64 "%-15lu" +#endif + +/* + * Given a timebuf of the form M/D/Y,H:M:S break it into individual elements. + */ +static void +dissect_time(char *tbuf, net_time_t *nt) +{ + char *d; + char *t; + char *dd; + char *h; + char *endp; + + if (tbuf == NULL || nt == NULL) + return; + + d = strtok(tbuf, ","); /* Date */ + t = strtok(NULL, ","); /* Time */ + + /* Month */ + dd = strtok(d, "/"); + if (dd == NULL) + return; + nt->net_time_mon = strtol(dd, &endp, 10); + + /* Day */ + dd = strtok(NULL, "/"); + if (dd == NULL) + return; + nt->net_time_day = strtol(dd, &endp, 10); + + /* Year */ + dd = strtok(NULL, "/"); + if (dd == NULL) + return; + nt->net_time_yr = strtol(dd, &endp, 10); + if (strlen(dd) <= 2) + nt->net_time_yr += 2000; + + if (t == NULL) + return; + + /* Hour */ + h = strtok(t, ":"); + if (h == NULL) + return; + nt->net_time_hr = strtol(h, &endp, 10); + + /* Min */ + h = strtok(NULL, ":"); + if (h == NULL) + return; + nt->net_time_min = strtol(h, &endp, 10); + + /* Sec */ + h = strtok(NULL, ":"); + if (h == NULL) + return; + nt->net_time_sec = strtol(h, &endp, 10); +} + +/* Get a stat item from an object in the exacct file */ +static void +add_stat_item(ea_object_t *o, net_stat_t *ns) +{ + switch (o->eo_catalog & EXT_TYPE_MASK) { + case EXT_STRING: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_STATS_NAME) { + (void) strncpy(ns->net_stat_name, o->eo_item.ei_string, + strlen(o->eo_item.ei_string)); + } + break; + case EXT_UINT64: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_STATS_CURTIME) { + time_t _time; + char timebuf[TIMEBUFLEN]; + + ns->net_stat_ctime = o->eo_item.ei_uint64; + _time = ns->net_stat_ctime; + (void) strftime(timebuf, sizeof (timebuf), + "%m/%d/%Y,%T\n", localtime(&_time)); + dissect_time(timebuf, &ns->net_stat_time); + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_IBYTES) { + ns->net_stat_ibytes = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_OBYTES) { + ns->net_stat_obytes = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_IPKTS) { + ns->net_stat_ipackets = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_OPKTS) { + ns->net_stat_opackets = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_IERRPKTS) { + ns->net_stat_ierrors = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_OERRPKTS) { + ns->net_stat_oerrors = o->eo_item.ei_uint64; + } + break; + default: + break; + } +} + +/* Get a description item from an object in the exacct file */ +static void +add_desc_item(ea_object_t *o, net_desc_t *nd) +{ + switch (o->eo_catalog & EXT_TYPE_MASK) { + case EXT_STRING: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_NAME) { + (void) strncpy(nd->net_desc_name, o->eo_item.ei_string, + strlen(o->eo_item.ei_string)); + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_DEVNAME) { + (void) strncpy(nd->net_desc_devname, + o->eo_item.ei_string, strlen(o->eo_item.ei_string)); + } + break; + case EXT_UINT8: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_PROTOCOL) { + nd->net_desc_protocol = o->eo_item.ei_uint8; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_DSFIELD) { + 
nd->net_desc_dsfield = o->eo_item.ei_uint8; + } + break; + case EXT_UINT16: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_SPORT) { + nd->net_desc_sport = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_DPORT) { + nd->net_desc_dport = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_SAP) { + nd->net_desc_sap = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_VLAN_TPID) { + nd->net_desc_vlan_tpid = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_VLAN_TCI) { + nd->net_desc_vlan_tci = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_PRIORITY) { + nd->net_desc_priority = o->eo_item.ei_uint16; + } + break; + case EXT_UINT32: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V4SADDR || + (o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V4DADDR) { + struct in_addr addr; + + addr.s_addr = htonl(o->eo_item.ei_uint32); + + if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_V4SADDR) { + IN6_INADDR_TO_V4MAPPED(&addr, + &nd->net_desc_saddr); + } else { + IN6_INADDR_TO_V4MAPPED(&addr, + &nd->net_desc_daddr); + } + } + break; + case EXT_UINT64: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_BWLIMIT) + nd->net_desc_bw_limit = o->eo_item.ei_uint64; + break; + case EXT_RAW: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V6SADDR || + (o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V6DADDR) { + in6_addr_t addr; + + addr = *(in6_addr_t *)o->eo_item.ei_raw; + if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_V6SADDR) { + nd->net_desc_saddr = addr; + } else { + nd->net_desc_daddr = addr; + } + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_EHOST) { + bcopy((uchar_t *)o->eo_item.ei_raw, nd->net_desc_ehost, + ETHERADDRL); + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_EDEST) { + bcopy((uchar_t *)o->eo_item.ei_raw, nd->net_desc_edest, + ETHERADDRL); + } + break; + default: + break; + } +} + +/* Add a description item to the table */ +static dladm_status_t +add_desc_to_tbl(net_table_t *net_table, net_desc_t *nd) +{ + net_entry_t *ne; + + if ((ne = calloc(1, sizeof (net_entry_t))) == NULL) + return (DLADM_STATUS_NOMEM); + + if ((ne->net_entry_tstats = calloc(1, sizeof (net_stat_t))) == NULL) { + free(ne); + return (DLADM_STATUS_NOMEM); + } + + ne->net_entry_desc = nd; + ne->net_entry_shead = NULL; + ne->net_entry_stail = NULL; + ne->net_entry_scount = 0; + + if (net_table->net_table_head == NULL) { + net_table->net_table_head = ne; + net_table->net_table_tail = ne; + } else { + net_table->net_table_tail->net_entry_next = ne; + net_table->net_table_tail = ne; + } + net_table->net_entries++; + return (DLADM_STATUS_OK); +} + +/* Compare dates and return if t1 is equal, greater or lesser than t2 */ +static int +compare_date(net_time_t *t1, net_time_t *t2) +{ + if (t1->net_time_yr == t2->net_time_yr && + t1->net_time_mon == t2->net_time_mon && + t1->net_time_day == t2->net_time_day) { + return (NET_DATE_EQUAL); + } + if (t1->net_time_yr > t2->net_time_yr || + (t1->net_time_yr == t2->net_time_yr && + t1->net_time_mon > t2->net_time_mon) || + (t1->net_time_yr == t2->net_time_yr && + t1->net_time_mon == t2->net_time_mon && + t1->net_time_day > t2->net_time_day)) { + return (NET_DATE_GREATER); + } + return (NET_DATE_LESSER); +} + +/* Compare times and return if t1 is equal, greater or lesser than t2 */ +static int +compare_time(net_time_t *t1, net_time_t *t2) +{ + int cd; + + cd = 
compare_date(t1, t2); + + if (cd == NET_DATE_GREATER) { + return (NET_TIME_GREATER); + } else if (cd == NET_DATE_LESSER) { + return (NET_TIME_LESSER); + } else { + if (t1->net_time_hr == t2->net_time_hr && + t1->net_time_min == t2->net_time_min && + t1->net_time_sec == t2->net_time_sec) { + return (NET_TIME_EQUAL); + } + if (t1->net_time_hr > t2->net_time_hr || + (t1->net_time_hr == t2->net_time_hr && + t1->net_time_min > t2->net_time_min) || + (t1->net_time_hr == t2->net_time_hr && + t1->net_time_min == t2->net_time_min && + t1->net_time_sec > t2->net_time_sec)) { + return (NET_TIME_GREATER); + } + } + return (NET_TIME_LESSER); +} + +/* + * Given a start and end time and start and end entries, check if the + * times are within the range, and adjust if needed. + */ +static dladm_status_t +chk_time_bound(net_time_t *s, net_time_t *e, net_time_t *sns, + net_time_t *ens) +{ + if (s != NULL && e != NULL) { + if (compare_time(s, e) == NET_TIME_GREATER) + return (DLADM_STATUS_BADTIMEVAL); + } + if (s != NULL) { + if (compare_time(s, sns) == NET_TIME_LESSER) { + s->net_time_yr = sns->net_time_yr; + s->net_time_mon = sns->net_time_mon; + s->net_time_day = sns->net_time_day; + s->net_time_hr = sns->net_time_hr; + s->net_time_min = sns->net_time_min; + s->net_time_sec = sns->net_time_sec; + } + } + if (e != NULL) { + if (compare_time(e, ens) == NET_TIME_GREATER) { + e->net_time_yr = ens->net_time_yr; + e->net_time_mon = ens->net_time_mon; + e->net_time_day = ens->net_time_day; + e->net_time_hr = ens->net_time_hr; + e->net_time_min = ens->net_time_min; + e->net_time_sec = ens->net_time_sec; + } + } + return (DLADM_STATUS_OK); +} + +/* + * Given a start and end time (strings), convert them into net_time_t + * and also check for the range given the head and tail of the list. + * If stime is earlier than the head or etime is later than the tail, adjust. + */ +static dladm_status_t +get_time_range(net_time_entry_t *head, net_time_entry_t *tail, + net_time_t *st, net_time_t *et, char *stime, char *etime) +{ + bzero(st, sizeof (net_time_t)); + bzero(et, sizeof (net_time_t)); + + if (stime == NULL && etime == NULL) + return (0); + + if (stime != NULL) + dissect_time(stime, st); + if (etime != NULL) + dissect_time(etime, et); + + if (stime != NULL || etime != NULL) { + return (chk_time_bound(stime == NULL ? NULL : st, + etime == NULL ? NULL : et, + &head->my_time_stat->net_stat_time, + &tail->my_time_stat->net_stat_time)); + } + return (0); +} + +/* + * Walk the list from a given starting point and return when we find + * an entry that is greater than or equal to st. lasttime will point to the + * previous time entry. + */ +static void +get_starting_point(net_time_entry_t *head, net_time_entry_t **start, + net_time_t *st, char *stime, uint64_t *lasttime) +{ + net_time_entry_t *next = head; + + if (head == NULL) { + *start = NULL; + return; + } + if (stime == NULL) { + *start = head; + *lasttime = head->my_time_stat->net_stat_ctime; + return; + } + *start = NULL; + while (next != NULL) { + if (compare_time(st, + &next->my_time_stat->net_stat_time) != NET_TIME_LESSER) { + *lasttime = next->my_time_stat->net_stat_ctime; + next = next->net_time_entry_next; + continue; + } + *start = next; + break; + } +} + +/* + * Point entry (pe) functions + */ +/* Clear all the counters. 
Done after the contents are written to the file */ +static void +clear_pe(net_plot_entry_t *pe, int entries, int *pentries) +{ + int count; + + for (count = 0; count < entries; count++) { + pe[count].net_pe_totbytes = 0; + pe[count].net_pe_totibytes = 0; + pe[count].net_pe_totobytes = 0; + pe[count].net_pe_tottime = 0; + } + *pentries = 0; +} + +/* Update an entry in the point entry table */ +static void +update_pe(net_plot_entry_t *pe, net_stat_t *nns, int nentries, + int *pentries, uint64_t lasttime) +{ + int count; + + for (count = 0; count < nentries; count++) { + if ((strlen(nns->net_stat_name) == + strlen(pe[count].net_pe_name)) && + (strncmp(pe[count].net_pe_name, nns->net_stat_name, + strlen(nns->net_stat_name)) == 0)) { + break; + } + } + if (count == nentries) + return; + + if (pe[count].net_pe_totbytes == 0) + pe[count].net_pe_lasttime = lasttime; + + pe[count].net_pe_totbytes += nns->net_stat_ibytes + + nns->net_stat_obytes; + pe[count].net_pe_tottime += nns->net_stat_tdiff; + pe[count].net_pe_totibytes += nns->net_stat_ibytes; + pe[count].net_pe_totobytes += nns->net_stat_obytes; + (*pentries)++; +} + +/* Flush the contents of the point entry table to the file. */ +static void +add_pe_to_file(int (*fn)(dladm_usage_t *, void *), net_plot_entry_t *pe, + net_stat_t *ns, int entries, void *arg) +{ + int count; + dladm_usage_t usage; + uint64_t tottime; + + bcopy(&ns->net_stat_ctime, &usage.du_etime, sizeof (usage.du_etime)); + for (count = 0; count < entries; count++) { + bcopy(pe[count].net_pe_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&pe[count].net_pe_lasttime, &usage.du_stime, + sizeof (usage.du_stime)); + usage.du_rbytes = pe[count].net_pe_totibytes; + usage.du_obytes = pe[count].net_pe_totobytes; + tottime = pe[count].net_pe_tottime; + usage.du_bandwidth = (tottime > 0) ? 
+ ((pe[count].net_pe_totbytes * 8) / tottime) : 0; + usage.du_last = (count == entries-1); + fn(&usage, arg); + } +} + +/* + * Net entry functions + */ +static net_entry_t * +get_ne_from_table(net_table_t *net_table, char *name) +{ + int count; + net_desc_t *nd; + net_entry_t *ne = net_table->net_table_head; + + for (count = 0; count < net_table->net_entries; count++) { + nd = ne->net_entry_desc; + if ((strlen(name) == strlen(nd->net_desc_name)) && + (strncmp(name, nd->net_desc_name, strlen(name)) == 0)) { + return (ne); + } + ne = ne->net_entry_next; + } + return (NULL); +} + +/* Get the entry for the descriptor, if it exists */ +static net_desc_t * +get_ndesc(net_table_t *net_table, net_desc_t *nd) +{ + int count; + net_desc_t *nd1; + net_entry_t *ne = net_table->net_table_head; + + for (count = 0; count < net_table->net_entries; count++) { + nd1 = ne->net_entry_desc; + if (strlen(nd1->net_desc_name) == strlen(nd->net_desc_name) && + strlen(nd1->net_desc_devname) == + strlen(nd->net_desc_devname) && + strncmp(nd1->net_desc_name, nd->net_desc_name, + strlen(nd1->net_desc_name)) == 0 && + strncmp(nd1->net_desc_devname, nd->net_desc_devname, + strlen(nd1->net_desc_devname)) == 0 && + bcmp(nd1->net_desc_ehost, nd->net_desc_ehost, + ETHERADDRL) == 0 && + bcmp(nd1->net_desc_edest, nd->net_desc_edest, + ETHERADDRL) == 0 && + nd1->net_desc_vlan_tpid == nd->net_desc_vlan_tpid && + nd1->net_desc_vlan_tci == nd->net_desc_vlan_tci && + nd1->net_desc_sap == nd->net_desc_sap && + nd1->net_desc_cpuid == nd->net_desc_cpuid && + nd1->net_desc_priority == nd->net_desc_priority && + nd1->net_desc_bw_limit == nd->net_desc_bw_limit && + nd1->net_desc_sport == nd->net_desc_sport && + nd1->net_desc_dport == nd->net_desc_dport && + nd1->net_desc_protocol == nd->net_desc_protocol && + nd1->net_desc_dsfield == nd->net_desc_dsfield && + IN6_ARE_ADDR_EQUAL(&nd1->net_desc_saddr, + &nd->net_desc_saddr) && + IN6_ARE_ADDR_EQUAL(&nd1->net_desc_daddr, + &nd->net_desc_daddr)) { + return (nd1); + } + ne = ne->net_entry_next; + } + return (NULL); +} + +/* + * Update the stat entries. The stats in the file are cumulative, so in order + * to have increments, we maintain a reference stat entry, which contains + * the stats when the record was first written, and a total stat entry, which + * maintains the running count. When we want to add a stat entry, if it is + * the reference stat entry, we don't come here. For subsequent entries, + * we get the increment by subtracting the current value from the reference + * stat and the total stat. 
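+ * Reference entries never reach this function; add_stat_to_tbl() + * handles them separately.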
+ */ +static void +update_stats(net_stat_t *ns1, net_entry_t *ne, net_stat_t *ref) +{ + + /* get the increment */ + ns1->net_stat_ibytes -= (ref->net_stat_ibytes + ref->net_stat_tibytes); + ns1->net_stat_obytes -= (ref->net_stat_obytes + ref->net_stat_tobytes); + ns1->net_stat_ipackets -= (ref->net_stat_ipackets + + ref->net_stat_tipackets); + ns1->net_stat_opackets -= (ref->net_stat_opackets + + ref->net_stat_topackets); + ns1->net_stat_ierrors -= (ref->net_stat_ierrors + + ref->net_stat_tierrors); + ns1->net_stat_oerrors -= (ref->net_stat_oerrors + + ref->net_stat_toerrors); + + /* update total bytes */ + ref->net_stat_tibytes += ns1->net_stat_ibytes; + ref->net_stat_tobytes += ns1->net_stat_obytes; + ref->net_stat_tipackets += ns1->net_stat_ipackets; + ref->net_stat_topackets += ns1->net_stat_opackets; + ref->net_stat_tierrors += ns1->net_stat_ierrors; + ref->net_stat_toerrors += ns1->net_stat_oerrors; + + ne->net_entry_tstats->net_stat_ibytes += ns1->net_stat_ibytes; + ne->net_entry_tstats->net_stat_obytes += ns1->net_stat_obytes; + ne->net_entry_tstats->net_stat_ipackets += ns1->net_stat_ipackets; + ne->net_entry_tstats->net_stat_opackets += ns1->net_stat_opackets; + ne->net_entry_tstats->net_stat_ierrors += ns1->net_stat_ierrors; + ne->net_entry_tstats->net_stat_oerrors += ns1->net_stat_oerrors; +} + +/* Add the stat entry into the table */ +static dladm_status_t +add_stat_to_tbl(net_table_t *net_table, net_stat_t *ns) +{ + net_entry_t *ne; + + ne = get_ne_from_table(net_table, ns->net_stat_name); + if (ne == NULL) + return (DLADM_STATUS_NOMEM); + + /* Ptr to flow desc */ + ns->net_stat_desc = ne->net_entry_desc; + if (ns->net_stat_desc->net_desc_newrec) { + ns->net_stat_desc->net_desc_newrec = B_FALSE; + ns->net_stat_isref = B_TRUE; + ne->net_entry_sref = ns; + } else if (ns->net_stat_ibytes < ne->net_entry_sref->net_stat_tibytes || + (ns->net_stat_obytes < ne->net_entry_sref->net_stat_tobytes)) { + ns->net_stat_isref = B_TRUE; + ne->net_entry_sref = ns; + } else { + ns->net_stat_isref = B_FALSE; + update_stats(ns, ne, ne->net_entry_sref); + } + if (ne->net_entry_shead == NULL) { + ne->net_entry_shead = ns; + ne->net_entry_stail = ns; + } else { + if (!ns->net_stat_isref) { + ne->net_entry_ttime += (ns->net_stat_ctime - + ne->net_entry_stail->net_stat_ctime); + ns->net_stat_tdiff = ns->net_stat_ctime - + ne->net_entry_stail->net_stat_ctime; + } + ne->net_entry_stail->net_stat_next = ns; + ne->net_entry_stail = ns; + } + + ne->net_entry_scount++; + return (DLADM_STATUS_OK); +} + +/* Add a flow/link descriptor record to the table */ +static dladm_status_t +add_desc(net_table_t *net_table, ea_file_t *ef, int nobjs) +{ + net_desc_t *nd; + net_desc_t *dnd; + int count; + ea_object_t scratch; + + if ((nd = calloc(1, sizeof (net_desc_t))) == NULL) + return (DLADM_STATUS_NOMEM); + nd->net_desc_newrec = B_TRUE; + + for (count = 0; count < nobjs; count++) { + if (ea_get_object(ef, &scratch) == -1) { + free(nd); + return (DLADM_STATUS_NOMEM); + } + add_desc_item(&scratch, nd); + } + if ((dnd = get_ndesc(net_table, nd)) != NULL) { + dnd->net_desc_newrec = B_TRUE; + free(nd); + return (DLADM_STATUS_OK); + } + if (add_desc_to_tbl(net_table, nd) != 0) { + free(nd); + return (DLADM_STATUS_NOMEM); + } + return (DLADM_STATUS_OK); +} + +/* Make an entry into the time sorted list */ +static void +addto_time_list(net_table_t *net_table, net_time_entry_t *nt, + net_time_entry_t *ntc) +{ + net_stat_t *ns = nt->my_time_stat; + net_stat_t *ns1; + net_time_entry_t *end; + net_time_t *t1; + int count; + + t1 
= &ns->net_stat_time; + + net_table->net_time_entries++; + + if (net_table->net_time_head == NULL) { + net_table->net_time_head = nt; + net_table->net_time_tail = nt; + } else { + net_table->net_time_tail->net_time_entry_next = nt; + nt->net_time_entry_prev = net_table->net_time_tail; + net_table->net_time_tail = nt; + } + + if (net_table->net_ctime_head == NULL) { + net_table->net_ctime_head = ntc; + net_table->net_ctime_tail = ntc; + } else { + end = net_table->net_ctime_tail; + count = 0; + while (count < net_table->net_time_entries - 1) { + ns1 = end->my_time_stat; + /* Just add it to the tail */ + if (compare_date(t1, &ns1->net_stat_time) == + NET_DATE_GREATER) { + break; + } + if ((strlen(ns1->net_stat_name) == + strlen(ns->net_stat_name)) && + (strncmp(ns1->net_stat_name, ns->net_stat_name, + strlen(ns1->net_stat_name)) == 0)) { + ntc->net_time_entry_next = + end->net_time_entry_next; + if (end->net_time_entry_next != NULL) { + end->net_time_entry_next-> + net_time_entry_prev = ntc; + } else { + net_table->net_ctime_tail = ntc; + } + end->net_time_entry_next = ntc; + ntc->net_time_entry_prev = end; + return; + } + count++; + end = end->net_time_entry_prev; + } + net_table->net_ctime_tail->net_time_entry_next = ntc; + ntc->net_time_entry_prev = net_table->net_ctime_tail; + net_table->net_ctime_tail = ntc; + } +} + +/* Add stat entry into the lists */ +static dladm_status_t +add_stats(net_table_t *net_table, ea_file_t *ef, int nobjs) +{ + net_stat_t *ns; + int count; + ea_object_t scratch; + net_time_entry_t *nt; + net_time_entry_t *ntc; + + if ((ns = calloc(1, sizeof (net_stat_t))) == NULL) + return (DLADM_STATUS_NOMEM); + + if ((nt = calloc(1, sizeof (net_time_entry_t))) == NULL) { + free(ns); + return (DLADM_STATUS_NOMEM); + } + if ((ntc = calloc(1, sizeof (net_time_entry_t))) == NULL) { + free(ns); + free(nt); + return (DLADM_STATUS_NOMEM); + } + + nt->my_time_stat = ns; + ntc->my_time_stat = ns; + + for (count = 0; count < nobjs; count++) { + if (ea_get_object(ef, &scratch) == -1) { + free(ns); + free(nt); + free(ntc); + return (DLADM_STATUS_NOMEM); + } + add_stat_item(&scratch, ns); + } + if (add_stat_to_tbl(net_table, ns) != 0) { + free(ns); + free(nt); + free(ntc); + return (DLADM_STATUS_NOMEM); + } + addto_time_list(net_table, nt, ntc); + return (DLADM_STATUS_OK); +} + +/* Free the entire table */ +static void +free_logtable(net_table_t *net_table) +{ + net_entry_t *head; + net_entry_t *next; + net_stat_t *ns; + net_stat_t *ns1; + net_time_entry_t *thead; + net_time_entry_t *tnext; + + thead = net_table->net_time_head; + while (thead != NULL) { + thead->my_time_stat = NULL; + tnext = thead->net_time_entry_next; + thead->net_time_entry_next = NULL; + thead->net_time_entry_prev = NULL; + free(thead); + thead = tnext; + } + net_table->net_time_head = NULL; + net_table->net_time_tail = NULL; + + thead = net_table->net_ctime_head; + while (thead != NULL) { + thead->my_time_stat = NULL; + tnext = thead->net_time_entry_next; + thead->net_time_entry_next = NULL; + thead->net_time_entry_prev = NULL; + free(thead); + thead = tnext; + } + net_table->net_ctime_head = NULL; + net_table->net_ctime_tail = NULL; + + net_table->net_time_entries = 0; + + head = net_table->net_table_head; + while (head != NULL) { + next = head->net_entry_next; + head->net_entry_next = NULL; + ns = head->net_entry_shead; + while (ns != NULL) { + ns1 = ns->net_stat_next; + free(ns); + ns = ns1; + } + head->net_entry_scount = 0; + head->net_entry_sref = NULL; + free(head->net_entry_desc); + 
free(head->net_entry_tstats); + free(head); + head = next; + } + net_table->net_table_head = NULL; + net_table->net_table_tail = NULL; + net_table->net_time_entries = 0; + free(net_table); +} + +/* Parse the exacct file, and return the parsed table. */ +static void * +parse_logfile(char *file, int logtype, dladm_status_t *status) +{ + ea_file_t ef; + ea_object_t scratch; + net_table_t *net_table; + + *status = DLADM_STATUS_OK; + if ((net_table = calloc(1, sizeof (net_table_t))) == NULL) { + *status = DLADM_STATUS_NOMEM; + return (NULL); + } + if (ea_open(&ef, file, NULL, 0, O_RDONLY, 0) == -1) { + *status = DLADM_STATUS_BADARG; + free(net_table); + return (NULL); + } + bzero(&scratch, sizeof (ea_object_t)); + while (ea_get_object(&ef, &scratch) != -1) { + if (scratch.eo_type != EO_GROUP) { + (void) ea_free_item(&scratch, EUP_ALLOC); + (void) bzero(&scratch, sizeof (ea_object_t)); + continue; + } + /* Read Link Desc/Stat records */ + if (logtype == DLADM_LOGTYPE_FLOW) { + /* Flow Descriptor */ + if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_DESC) { + (void) add_desc(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + /* Flow Stats */ + } else if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_STATS) { + (void) add_stats(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } + } else if (logtype == DLADM_LOGTYPE_LINK) { + /* Link Descriptor */ + if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_LINK_DESC) { + (void) add_desc(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + /* Link Stats */ + } else if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_LINK_STATS) { + (void) add_stats(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } + } else { + if (((scratch.eo_catalog & EXD_DATA_MASK) == + EXD_GROUP_NET_LINK_DESC) || ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_DESC)) { + (void) add_desc(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } else if (((scratch.eo_catalog & EXD_DATA_MASK) == + EXD_GROUP_NET_LINK_STATS) || ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_STATS)) { + (void) add_stats(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } + } + (void) ea_free_item(&scratch, EUP_ALLOC); + (void) bzero(&scratch, sizeof (ea_object_t)); + } + + (void) ea_close(&ef); + return ((void *)net_table); +} + +/* + * Walk the ctime list. This is used when looking for usage records + * based on a "resource" name. 
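+ * stime and etime, when non-NULL, use the M/D/Y,H:M:S format parsed + * by dissect_time().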
+ */ +dladm_status_t +dladm_walk_usage_res(int (*fn)(dladm_usage_t *, void *), int logtype, + char *logfile, char *resource, char *stime, char *etime, void *arg) +{ + net_table_t *net_table; + net_time_t st, et; + net_time_entry_t *start; + net_stat_t *ns = NULL; + net_stat_t *nns; + uint64_t tot_time = 0; + uint64_t last_time; + uint64_t tot_bytes = 0; + uint64_t tot_ibytes = 0; + uint64_t tot_obytes = 0; + boolean_t gotstart = B_FALSE; + dladm_status_t status; + dladm_usage_t usage; + int step = 1; + + /* Parse the log file */ + net_table = parse_logfile(logfile, logtype, &status); + if (net_table == NULL) + return (status); + + if (net_table->net_entries == 0) + return (DLADM_STATUS_OK); + start = net_table->net_ctime_head; + + /* Time range */ + status = get_time_range(net_table->net_ctime_head, + net_table->net_ctime_tail, &st, &et, stime, etime); + if (status != DLADM_STATUS_OK) + return (status); + + while (start != NULL) { + nns = start->my_time_stat; + + /* Get to the resource we are interested in */ + if ((strlen(resource) != strlen(nns->net_stat_name)) || + (strncmp(resource, nns->net_stat_name, + strlen(nns->net_stat_name)) != 0)) { + start = start->net_time_entry_next; + continue; + } + + /* Find the first record */ + if (!gotstart) { + get_starting_point(start, &start, &st, stime, + &last_time); + if (start == NULL) + break; + nns = start->my_time_stat; + gotstart = B_TRUE; + } + + /* Write one entry and return if we are out of the range */ + if (etime != NULL && compare_time(&nns->net_stat_time, &et) + == NET_TIME_GREATER) { + if (tot_bytes != 0) { + bcopy(ns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&last_time, &usage.du_stime, + sizeof (usage.du_stime)); + bcopy(&ns->net_stat_ctime, &usage.du_etime, + sizeof (usage.du_etime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + } + return (DLADM_STATUS_OK); + } + + /* + * If this is a reference entry, just print what we have + * and proceed. 
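+		 * The running totals are flushed and reset before moving on.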
+ */ + if (nns->net_stat_isref) { + if (tot_bytes != 0) { + bcopy(&nns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&nns->net_stat_ctime, &usage.du_stime, + sizeof (usage.du_stime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + NET_RESET_TOT(tot_bytes, tot_time, tot_ibytes, + tot_obytes, step); + } + last_time = nns->net_stat_ctime; + start = start->net_time_entry_next; + continue; + } + + ns = nns; + if (--step == 0) { + tot_bytes += ns->net_stat_ibytes + ns->net_stat_obytes; + tot_ibytes += ns->net_stat_ibytes; + tot_obytes += ns->net_stat_obytes; + tot_time += ns->net_stat_tdiff; + bcopy(&ns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&last_time, &usage.du_stime, + sizeof (usage.du_stime)); + bcopy(&ns->net_stat_ctime, &usage.du_etime, + sizeof (usage.du_etime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + + NET_RESET_TOT(tot_bytes, tot_time, tot_ibytes, + tot_obytes, step); + last_time = ns->net_stat_ctime; + } else { + tot_bytes += ns->net_stat_ibytes + ns->net_stat_obytes; + tot_ibytes += ns->net_stat_ibytes; + tot_obytes += ns->net_stat_obytes; + tot_time += ns->net_stat_tdiff; + } + start = start->net_time_entry_next; + } + + if (tot_bytes != 0) { + bcopy(&ns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&last_time, &usage.du_stime, + sizeof (usage.du_stime)); + bcopy(&ns->net_stat_ctime, &usage.du_etime, + sizeof (usage.du_etime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + } + + free_logtable(net_table); + return (status); +} + +/* + * Walk the time sorted list if a resource is not specified. + */ +dladm_status_t +dladm_walk_usage_time(int (*fn)(dladm_usage_t *, void *), int logtype, + char *logfile, char *stime, char *etime, void *arg) +{ + net_table_t *net_table; + net_time_entry_t *start; + net_stat_t *ns = NULL, *nns; + net_time_t st, et, *t1; + net_desc_t *nd; + net_entry_t *ne; + net_plot_entry_t *pe; + int count; + int step = 1; + int nentries = 0, pentries = 0; + uint64_t last_time; + dladm_status_t status; + + /* Parse the log file */ + net_table = parse_logfile(logfile, logtype, &status); + if (net_table == NULL) + return (status); + + if (net_table->net_entries == 0) + return (DLADM_STATUS_OK); + start = net_table->net_time_head; + + /* Find the first and last records and starting point */ + status = get_time_range(net_table->net_time_head, + net_table->net_time_tail, &st, &et, stime, etime); + if (status != DLADM_STATUS_OK) + return (status); + get_starting_point(start, &start, &st, stime, &last_time); + /* + * Could assert to be non-null, since get_time_range() + * would have adjusted. + */ + if (start == NULL) + return (DLADM_STATUS_BADTIMEVAL); + + /* + * Collect entries for all resources in a time slot before + * writing to the file. 
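+	 * One net_plot_entry_t is filled in per resource.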
+ */
+	nentries = net_table->net_entries;
+
+	pe = malloc(sizeof (net_plot_entry_t) * (net_table->net_entries + 1));
+	if (pe == NULL)
+		return (DLADM_STATUS_NOMEM);
+
+	ne = net_table->net_table_head;
+	for (count = 0; count < nentries; count++) {
+		nd = ne->net_entry_desc;
+		pe[count].net_pe_name = nd->net_desc_name;
+		ne = ne->net_entry_next;
+	}
+
+	clear_pe(pe, nentries, &pentries);
+
+	/* Write header to file */
+	/* add_pe_to_file(fn, pe, ns, nentries, arg); */
+
+	t1 = &start->my_time_stat->net_stat_time;
+
+	while (start != NULL) {
+
+		nns = start->my_time_stat;
+		/*
+		 * We have crossed the time boundary; check whether we need
+		 * to print out now.
+		 */
+		if (compare_time(&nns->net_stat_time, t1) ==
+		    NET_TIME_GREATER) {
+			/* Return if we are out of the range. */
+			if (etime != NULL &&
+			    compare_time(&nns->net_stat_time, &et) ==
+			    NET_TIME_GREATER) {
+				if (pentries > 0) {
+					add_pe_to_file(fn, pe, ns, nentries,
+					    arg);
+					clear_pe(pe, nentries, &pentries);
+				}
+				free(pe);
+				return (DLADM_STATUS_OK);
+			}
+			/* Update the stats from ns. */
+			t1 = &nns->net_stat_time;
+			last_time = ns->net_stat_ctime;
+			if (--step == 0) {
+				if (pentries > 0) {
+					add_pe_to_file(fn, pe, ns, nentries,
+					    arg);
+					clear_pe(pe, nentries, &pentries);
+				}
+				step = 1;
+			}
+		}
+
+		/*
+		 * If this is a reference entry, just print what we have
+		 * for this resource and proceed. We will end up writing
+		 * the stats for all the entries when we hit a ref element,
+		 * which means 'steps' for some might not be accurate, but
+		 * that is fine; the alternative is to write only the
+		 * resource for which we hit a reference entry.
+		 */
+		if (nns->net_stat_isref) {
+			if (pentries > 0) {
+				add_pe_to_file(fn, pe, ns, nentries, arg);
+				clear_pe(pe, nentries, &pentries);
+			}
+			step = 1;
+		} else {
+			update_pe(pe, nns, nentries, &pentries, last_time);
+		}
+		ns = nns;
+		start = start->net_time_entry_next;
+	}
+
+	if (pentries > 0)
+		add_pe_to_file(fn, pe, ns, nentries, arg);
+
+	free(pe);
+	free_logtable(net_table);
+
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_usage_summary(int (*fn)(dladm_usage_t *, void *), int logtype,
+    char *logfile, void *arg)
+{
+	net_table_t	*net_table;
+	net_entry_t	*ne;
+	net_desc_t	*nd;
+	net_stat_t	*ns;
+	int		count;
+	dladm_usage_t	usage;
+	dladm_status_t	status;
+
+	/* Parse the log file */
+	net_table = parse_logfile(logfile, logtype, &status);
+	if (net_table == NULL)
+		return (status);
+
+	if (net_table->net_entries == 0)
+		return (DLADM_STATUS_OK);
+
+	ne = net_table->net_table_head;
+	for (count = 0; count < net_table->net_entries;
+	    count++, ne = ne->net_entry_next) {
+		ns = ne->net_entry_tstats;
+		nd = ne->net_entry_desc;
+
+		if (ns->net_stat_ibytes + ns->net_stat_obytes == 0)
+			continue;
+		bcopy(&nd->net_desc_name, &usage.du_name,
+		    sizeof (usage.du_name));
+		usage.du_duration = ne->net_entry_ttime;
+		usage.du_ipackets = ns->net_stat_ipackets;
+		usage.du_rbytes = ns->net_stat_ibytes;
+		usage.du_opackets = ns->net_stat_opackets;
+		usage.du_obytes = ns->net_stat_obytes;
+		usage.du_bandwidth =
+		    (ns->net_stat_ibytes + ns->net_stat_obytes) * 8 /
+		    usage.du_duration;
+		usage.du_last = (count == net_table->net_entries-1);
+		fn(&usage, arg);
+	}
+
+	free_logtable(net_table);
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walk the ctime list and display the dates of the records.
+ */ +dladm_status_t +dladm_usage_dates(int (*fn)(dladm_usage_t *, void *), int logtype, + char *logfile, char *resource, void *arg) +{ + net_table_t *net_table; + net_time_entry_t *start; + net_stat_t *nns; + net_time_t st; + net_time_t *lasttime = NULL; + uint64_t last_time; + boolean_t gotstart = B_FALSE; + dladm_status_t status; + dladm_usage_t usage; + + /* Parse the log file */ + net_table = parse_logfile(logfile, logtype, &status); + if (net_table == NULL) + return (status); + + if (net_table->net_entries == 0) + return (DLADM_STATUS_OK); + + start = net_table->net_ctime_head; + + while (start != NULL) { + nns = start->my_time_stat; + + /* get to the resource we are interested in */ + if (resource != NULL) { + if ((strlen(resource) != strlen(nns->net_stat_name)) || + (strncmp(resource, nns->net_stat_name, + strlen(nns->net_stat_name)) != 0)) { + start = start->net_time_entry_next; + continue; + } + } + + /* get the starting point in the logfile */ + if (!gotstart) { + get_starting_point(start, &start, &st, NULL, + &last_time); + if (start == NULL) + break; + nns = start->my_time_stat; + gotstart = B_TRUE; + } + + if (lasttime == NULL || + compare_date(&nns->net_stat_time, lasttime) == + NET_DATE_GREATER) { + bzero(&usage, sizeof (dladm_usage_t)); + bcopy(&nns->net_stat_ctime, &usage.du_stime, + sizeof (usage.du_stime)); + fn(&usage, arg); + lasttime = &nns->net_stat_time; + } + + start = start->net_time_entry_next; + continue; + } + + free_logtable(net_table); + return (status); +} diff --git a/usr/src/lib/libsecdb/exec_attr.txt b/usr/src/lib/libsecdb/exec_attr.txt index ae7d769e2a..e0ef11b073 100644 --- a/usr/src/lib/libsecdb/exec_attr.txt +++ b/usr/src/lib/libsecdb/exec_attr.txt @@ -193,6 +193,8 @@ Network Management:solaris:cmd:::/sbin/routeadm:euid=0;\ privs=proc_chroot,proc_owner,sys_ip_config Network Management:solaris:cmd:::/sbin/dladm:euid=dladm;egid=sys;\ privs=sys_dl_config,net_rawaccess,proc_audit +Network Management:solaris:cmd:::/sbin/flowadm:euid=dladm;egid=sys;\ + privs=sys_dl_config,net_rawaccess,proc_audit Network Management:suser:cmd:::/usr/bin/netstat:uid=0 Network Management:suser:cmd:::/usr/bin/rup:euid=0 Network Management:suser:cmd:::/usr/bin/ruptime:euid=0 diff --git a/usr/src/lib/libsecdb/help/auths/Makefile b/usr/src/lib/libsecdb/help/auths/Makefile index 8bc756895f..42d1d72c96 100644 --- a/usr/src/lib/libsecdb/help/auths/Makefile +++ b/usr/src/lib/libsecdb/help/auths/Makefile @@ -70,6 +70,7 @@ HTMLENTS = \ SmfExAcctFlowStates.html \ SmfExAcctProcessStates.html \ SmfExAcctTaskStates.html \ + SmfExAcctNetStates.html \ SmfHeader.html \ SmfInetdStates.html \ SmfIPsecStates.html \ @@ -93,6 +94,7 @@ HTMLENTS = \ SmfValueExAcctFlow.html \ SmfValueExAcctProcess.html \ SmfValueExAcctTask.html \ + SmfValueExAcctNet.html \ SmfVtStates.html \ SmfValueHeader.html \ SmfValueInetd.html \ diff --git a/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html b/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html new file mode 100644 index 0000000000..e042637323 --- /dev/null +++ b/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html @@ -0,0 +1,37 @@ +<HTML> +<!-- + CDDL HEADER START + + The contents of this file are subject to the terms of the + Common Development and Distribution License (the "License"). + You may not use this file except in compliance with the License. + + You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + or http://www.opensolaris.org/os/licensing. 
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+-->
+<!--
+ <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
+-->
+<BODY>
+When Manage Net Extended Accounting Service States is in the Authorizations
+Included column, it grants the authorization to enable or disable net
+extended accounting.
+<p>
+If Manage Net Extended Accounting Service States is grayed, then you are not
+entitled to Add or Remove this authorization.
+<BR>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html b/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html
new file mode 100644
index 0000000000..52f735c4b9
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html
@@ -0,0 +1,35 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+-->
+
+<BODY>
+When Change Values of Net Extended Accounting Service Properties is in the
+Authorizations Included column, it grants the authorization to change
+net extended accounting configuration parameter values.
+<P>
+If Change Values of Net Extended Accounting Service Properties is grayed,
+then you are not entitled to Add or Remove this authorization.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/profiles/Makefile b/usr/src/lib/libsecdb/help/profiles/Makefile
index 37f9608f0b..0d93f0929b 100644
--- a/usr/src/lib/libsecdb/help/profiles/Makefile
+++ b/usr/src/lib/libsecdb/help/profiles/Makefile
@@ -38,6 +38,7 @@ HTMLENTS = \
 	RtExAcctFlow.html \
 	RtExAcctProcess.html \
 	RtExAcctTask.html \
+	RtExAcctNet.html \
 	RtLogMngmnt.html \
 	RtDeviceMngmnt.html \
 	RtDeviceSecurity.html \
diff --git a/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html b/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html
new file mode 100644
index 0000000000..25861d980e
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html
@@ -0,0 +1,39 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+-- Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+-- Use is subject to license terms.
+-->
+<HEAD>
+ <TITLE> </TITLE>
+
+
+</HEAD>
+<BODY>
+When Manage the Net Extended Accounting service is in the Rights Included
+column, it grants the right to commands needed to administer net extended
+accounting.
+<p>
+If Manage the Net Extended Accounting service is grayed, then you are not
+entitled to Add or Remove this right.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/prof_attr.txt b/usr/src/lib/libsecdb/prof_attr.txt
index 9799ec15c2..ccf8b5081f 100644
--- a/usr/src/lib/libsecdb/prof_attr.txt
+++ b/usr/src/lib/libsecdb/prof_attr.txt
@@ -44,6 +44,7 @@ DHCP Management:::Manage the DHCP service:auths=solaris.dhcpmgr.*;help=RtDHCPMng
 Extended Accounting Flow Management:::Manage the Flow Extended Accounting service:auths=solaris.smf.manage.extended-accounting.flow,solaris.smf.value.extended-accounting.flow;profiles=acctadm;help=RtExActtFlow.html
 Extended Accounting Process Management:::Manage the Process Extended Accounting service:auths=solaris.smf.manage.extended-accounting.process,solaris.smf.value.extended-accounting.process;profiles=acctadm;hep=RtExAcctProcess.html
 Extended Accounting Task Management:::Manage the Task Extended Accounting service:auths=solaris.smf.manage.extended-accounting.task,solaris.smf.value.extended-accounting.task;profiles=acctadm;help=RtExAcctTask.html
+Extended Accounting Net Management:::Manage the Net Extended Accounting service:auths=solaris.smf.manage.extended-accounting.net,solaris.smf.value.extended-accounting.net;profiles=acctadm;help=RtExAcctNet.html
 File System Management:::Manage, mount, share file systems:profiles=SMB Management,VSCAN Management,SMBFS Management;auths=solaris.smf.manage.autofs,solaris.smf.manage.shares.*,solaris.smf.value.shares.*;help=RtFileSysMngmnt.html
 File System Security:::Manage file system security attributes:help=RtFileSysSecurity.html
 HAL Management:::Manage HAL SMF service:auths=solaris.smf.manage.hal;help=RtHALMngmnt.html
diff --git a/usr/src/pkgdefs/SUNW0on/prototype_com b/usr/src/pkgdefs/SUNW0on/prototype_com
index 14419f0097..34c71c492a 100644
--- a/usr/src/pkgdefs/SUNW0on/prototype_com
+++ b/usr/src/pkgdefs/SUNW0on/prototype_com
@@ -242,6 +242,7 @@ f none usr/lib/help/auths/locale/SmfCronStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfExAcctFlowStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfExAcctProcessStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfExAcctTaskStates.html 444 root bin
+f none usr/lib/help/auths/locale/SmfExAcctNetStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfHeader.html 444 root bin
 f none usr/lib/help/auths/locale/SmfInetdStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfManageHeader.html 444 root bin
@@ -267,6 +268,7 @@ f none usr/lib/help/auths/locale/SmfValueCoreadm.html 444 root bin
 f none usr/lib/help/auths/locale/SmfValueExAcctFlow.html 444
root bin f none usr/lib/help/auths/locale/SmfValueExAcctProcess.html 444 root bin f none usr/lib/help/auths/locale/SmfValueExAcctTask.html 444 root bin +f none usr/lib/help/auths/locale/SmfValueExAcctNet.html 444 root bin f none usr/lib/help/auths/locale/SmfVtStates.html 444 root bin f none usr/lib/help/auths/locale/SmfValueHeader.html 444 root bin f none usr/lib/help/auths/locale/SmfValueInetd.html 444 root bin @@ -344,6 +346,7 @@ f none usr/lib/help/profiles/locale/RtDeviceMngmnt.html 444 root bin f none usr/lib/help/profiles/locale/RtExAcctFlow.html 444 root bin f none usr/lib/help/profiles/locale/RtExAcctProcess.html 444 root bin f none usr/lib/help/profiles/locale/RtExAcctTask.html 444 root bin +f none usr/lib/help/profiles/locale/RtExAcctNet.html 444 root bin f none usr/lib/help/profiles/locale/RtPrntAdmin.html 444 root bin f none usr/lib/help/profiles/locale/RtConsUser.html 444 root bin f none usr/lib/help/profiles/locale/RtContractObserver.html 444 root bin diff --git a/usr/src/pkgdefs/SUNWcnetr/postinstall b/usr/src/pkgdefs/SUNWcnetr/postinstall index cb6ab86de9..20d09c70ee 100644 --- a/usr/src/pkgdefs/SUNWcnetr/postinstall +++ b/usr/src/pkgdefs/SUNWcnetr/postinstall @@ -109,6 +109,44 @@ if [ -f "${ORIG}" ]; then removef -f $PKGINST > /dev/null 2>&1 fi +# Convert hostname.xxx and zonecfg vlan entries +host_ifs=`ls -1 $rootprefix/etc | egrep -e '^hostname.|^hostname6.|^dhcp.'| \ + cut -d . -f2 | sort -u` + +zones=`zoneadm list -c | grep -v global` +for zone in $zones +do + zonecfg -z $zone info ip-type | grep exclusive >/dev/null + if [ $? -eq 0 ]; then + zif=`zonecfg -z $zone info net | grep physical | \ + nawk '{print $2}'` + zone_ifs="$zone_ifs $zif" + fi +done + +ORIG=$BASEDIR/etc/dladm/datalink.conf +for ifname in $host_ifs $zone_ifs +do + grep $ifname $ORIG >/dev/null + if [ $? != 0 ]; then + phys=`echo $ifname | sed "s/[0-9]*$//"` + devnum=`echo $ifname | sed "s/$phys//g"` + if [ "$phys$devnum" != $ifname -o \ + -n "`echo $devnum | tr -d '[0-9]'`" ]; then + echo "skipping invalid interface $ifname" + continue + fi + + vid=`expr $devnum / 1000` + inst=`expr $devnum % 1000` + + if [ "$vid" != "0" ]; then + echo dladm create-vlan -l $phys$inst -v $vid \ + $ifname >> ${PKG_INSTALL_ROOT}/$UPGRADE_SCRIPT + fi + fi +done + # # Change permissions of public IKE certificates and CRLs # that may have been incorrectly created as private diff --git a/usr/src/pkgdefs/SUNWcnetr/prototype_com b/usr/src/pkgdefs/SUNWcnetr/prototype_com index 307a2a7303..7091ec4bc5 100644 --- a/usr/src/pkgdefs/SUNWcnetr/prototype_com +++ b/usr/src/pkgdefs/SUNWcnetr/prototype_com @@ -22,7 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # This required package information file contains a list of package contents. 
# The 'pkgmk' command uses this file to identify the contents of a package @@ -53,6 +52,8 @@ d none etc 755 root sys d none etc/dladm 755 dladm sys e preserve etc/dladm/secobj.conf 600 dladm sys e preserve etc/dladm/datalink.conf 644 dladm sys +e preserve etc/dladm/flowadm.conf 644 dladm sys +e preserve etc/dladm/flowprop.conf 644 dladm sys d none etc/default 755 root sys e dhcpagent etc/default/dhcpagent 644 root sys e preserve etc/default/inetinit 644 root sys @@ -74,3 +75,4 @@ e sock2path etc/inet/sock2path 444 root sys s none etc/sock2path=./inet/sock2path d none sbin 755 root sys f none sbin/dladm 555 root bin +f none sbin/flowadm 555 root bin diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com index c3505988cb..b1021d4267 100644 --- a/usr/src/pkgdefs/SUNWcsu/prototype_com +++ b/usr/src/pkgdefs/SUNWcsu/prototype_com @@ -482,6 +482,7 @@ f none usr/lib/help/auths/locale/C/SmfCronStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfExAcctFlowStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfExAcctProcessStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfExAcctTaskStates.html 444 root bin +f none usr/lib/help/auths/locale/C/SmfExAcctNetStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfHeader.html 444 root bin f none usr/lib/help/auths/locale/C/SmfManageHeader.html 444 root bin f none usr/lib/help/auths/locale/C/SmfMDNSStates.html 444 root bin @@ -506,6 +507,7 @@ f none usr/lib/help/auths/locale/C/SmfValueCoreadm.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueExAcctFlow.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueExAcctProcess.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueExAcctTask.html 444 root bin +f none usr/lib/help/auths/locale/C/SmfValueExAcctNet.html 444 root bin f none usr/lib/help/auths/locale/C/SmfVtStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueHeader.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueInetd.html 444 root bin @@ -564,6 +566,7 @@ f none usr/lib/help/profiles/locale/C/RtCryptoMngmnt.html 444 root bin f none usr/lib/help/profiles/locale/C/RtExAcctFlow.html 444 root bin f none usr/lib/help/profiles/locale/C/RtExAcctProcess.html 444 root bin f none usr/lib/help/profiles/locale/C/RtExAcctTask.html 444 root bin +f none usr/lib/help/profiles/locale/C/RtExAcctNet.html 444 root bin f none usr/lib/help/profiles/locale/C/RtDHCPMngmnt.html 444 root bin f none usr/lib/help/profiles/locale/C/RtDatAdmin.html 444 root bin f none usr/lib/help/profiles/locale/C/RtDefault.html 444 root bin @@ -683,6 +686,7 @@ f none usr/lib/rcm/modules/SUNW_ip_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_mpxio_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_network_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_vlan_rcm.so 555 root bin +f none usr/lib/rcm/modules/SUNW_vnic_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_aggr_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_swap_rcm.so 555 root bin f none usr/lib/rcm/rcm_daemon 555 root bin @@ -828,6 +832,7 @@ s none usr/sbin/edquota=../lib/fs/ufs/edquota f none usr/sbin/eeprom 2555 root sys s none usr/sbin/fdisk=../../sbin/fdisk f none usr/sbin/ff 555 root bin +s none usr/sbin/flowadm=../../sbin/flowadm s none usr/sbin/fiocompress=../../sbin/fiocompress f none usr/sbin/fmthard 555 root sys f none usr/sbin/format 555 root bin diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_i386 b/usr/src/pkgdefs/SUNWmdb/prototype_i386 index f7620e480d..05c255e659 100644 --- 
a/usr/src/pkgdefs/SUNWmdb/prototype_i386 +++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386 @@ -71,6 +71,7 @@ f none usr/lib/mdb/kvm/amd64/ipp.so 555 root sys f none usr/lib/mdb/kvm/amd64/krtld.so 555 root sys f none usr/lib/mdb/kvm/amd64/lofs.so 555 root sys f none usr/lib/mdb/kvm/amd64/logindmux.so 555 root sys +f none usr/lib/mdb/kvm/amd64/mac.so 555 root sys f none usr/lib/mdb/kvm/amd64/md.so 555 root sys f none usr/lib/mdb/kvm/amd64/mdb_kb.so 555 root sys f none usr/lib/mdb/kvm/amd64/mdb_ks.so 555 root sys @@ -103,6 +104,7 @@ f none usr/lib/mdb/kvm/ipp.so 555 root sys f none usr/lib/mdb/kvm/krtld.so 555 root sys f none usr/lib/mdb/kvm/lofs.so 555 root sys f none usr/lib/mdb/kvm/logindmux.so 555 root sys +f none usr/lib/mdb/kvm/mac.so 555 root sys f none usr/lib/mdb/kvm/md.so 555 root sys f none usr/lib/mdb/kvm/mdb_kb.so 555 root sys f none usr/lib/mdb/kvm/mdb_ks.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc index 7e6878d47e..51f5c49182 100644 --- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" !include prototype_com @@ -53,6 +52,7 @@ f none usr/lib/mdb/kvm/sparcv9/isp.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/krtld.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/lofs.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/logindmux.so 555 root sys +f none usr/lib/mdb/kvm/sparcv9/mac.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/md.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/mdb_ks.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/mpt.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 b/usr/src/pkgdefs/SUNWmdbr/prototype_i386 index 24755c9731..237c1da83b 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_i386 @@ -41,6 +41,7 @@ f none kernel/kmdb/amd64/ipp 555 root sys f none kernel/kmdb/amd64/krtld 555 root sys f none kernel/kmdb/amd64/lofs 555 root sys f none kernel/kmdb/amd64/logindmux 555 root sys +f none kernel/kmdb/amd64/mac 555 root sys f none kernel/kmdb/amd64/md 555 root sys f none kernel/kmdb/amd64/mdb_ds 555 root sys f none kernel/kmdb/amd64/mpt 555 root sys @@ -72,6 +73,7 @@ f none kernel/kmdb/ipp 555 root sys f none kernel/kmdb/krtld 555 root sys f none kernel/kmdb/lofs 555 root sys f none kernel/kmdb/logindmux 555 root sys +f none kernel/kmdb/mac 555 root sys f none kernel/kmdb/md 555 root sys f none kernel/kmdb/mdb_ds 555 root sys f none kernel/kmdb/mpt 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc index 99bb424c63..b4057c2328 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" # !include prototype_com @@ -43,6 +42,7 @@ f none kernel/kmdb/sparcv9/isp 555 root sys f none kernel/kmdb/sparcv9/krtld 555 root sys f none kernel/kmdb/sparcv9/lofs 555 root sys f none kernel/kmdb/sparcv9/logindmux 555 root sys +f none kernel/kmdb/sparcv9/mac 555 root sys f none kernel/kmdb/sparcv9/md 555 root sys f none kernel/kmdb/sparcv9/mdb_ds 555 root sys f none kernel/kmdb/sparcv9/mpt 555 root sys diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index ee2ddf8352..e7a2d79ed1 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -89,12 +89,21 @@ usr/include/sys/dld.h i386 usr/include/sys/dld_impl.h i386 usr/include/sys/dld_ioc.h i386 usr/include/sys/dls.h i386 +usr/include/sys/dls_mgmt.h i386 usr/include/sys/dls_impl.h i386 usr/include/sys/mac.h i386 +usr/include/sys/mac_client.h i386 +usr/include/sys/mac_client_impl.h i386 +usr/include/sys/mac_flow.h i386 +usr/include/sys/mac_flow_impl.h i386 usr/include/sys/mac_impl.h i386 +usr/include/sys/mac_provider.h i386 +usr/include/sys/mac_soft_ring.h i386 # # Private GLDv3 userland libraries and headers # +usr/include/sys/vnic.h i386 +usr/include/sys/vnic_impl.h i386 usr/include/libdladm.h i386 usr/include/libdladm_impl.h i386 usr/include/libdllink.h i386 @@ -102,8 +111,11 @@ usr/include/libdlaggr.h i386 usr/include/libdlwlan.h i386 usr/include/libdlwlan_impl.h i386 usr/include/libdlvnic.h i386 +usr/include/libdlflow.h i386 +usr/include/libdlflow_impl.h i386 usr/include/libdlvlan.h i386 usr/include/libdlmgmt.h i386 +usr/include/libdlstat.h i386 lib/libdladm.so i386 lib/llib-ldladm.ln i386 lib/amd64/libdladm.so i386 @@ -528,6 +540,7 @@ lib/llib-lmeta.ln i386 # non-public pci header # usr/include/sys/pci_impl.h i386 +usr/include/sys/pci_tools.h i386 # # Exception list for RCM project, included by librcm and rcm_daemon # diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc index ece69f8eef..005ace8c07 100644 --- a/usr/src/pkgdefs/etc/exception_list_sparc +++ b/usr/src/pkgdefs/etc/exception_list_sparc @@ -78,21 +78,33 @@ usr/include/sys/dld.h sparc usr/include/sys/dld_impl.h sparc usr/include/sys/dld_ioc.h sparc usr/include/sys/dls.h sparc +usr/include/sys/dls_mgmt.h sparc usr/include/sys/dls_impl.h sparc usr/include/sys/mac.h sparc +usr/include/sys/mac_client.h sparc +usr/include/sys/mac_client_impl.h sparc +usr/include/sys/mac_flow.h sparc +usr/include/sys/mac_flow_impl.h sparc usr/include/sys/mac_impl.h sparc +usr/include/sys/mac_provider.h sparc +usr/include/sys/mac_soft_ring.h sparc # # Private GLDv3 userland libraries and headers # +usr/include/sys/vnic.h sparc +usr/include/sys/vnic_impl.h sparc usr/include/libdladm.h sparc usr/include/libdladm_impl.h sparc usr/include/libdllink.h sparc usr/include/libdlaggr.h sparc +usr/include/libdlflow.h sparc +usr/include/libdlflow_impl.h sparc usr/include/libdlwlan.h sparc usr/include/libdlwlan_impl.h sparc usr/include/libdlvnic.h sparc usr/include/libdlvlan.h sparc usr/include/libdlmgmt.h sparc +usr/include/libdlstat.h sparc lib/libdladm.so sparc lib/llib-ldladm.ln sparc lib/sparcv9/libdladm.so sparc @@ -531,6 +543,7 @@ lib/llib-lmeta.ln sparc # non-public pci header # usr/include/sys/pci_impl.h sparc +usr/include/sys/pci_tools.h sparc # # Exception list for RCM project, included by librcm and rcm_daemon # diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 88404359fd..100d0e594d 100644 --- 
a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -666,7 +666,7 @@ inetd_conf_svm_hack() { } upgrade_aggr_and_linkprop () { - # Since aggregation.conf and linkprop.conf are upgraded by + # Since aggregation.conf and linkprop.conf are upgraded by # SUNWcnetr's postinstall script, put the relevant portions of the # postinstall script here, modified to rename the old files instead # of removing them. @@ -756,6 +756,30 @@ upgrade_aggr_and_linkprop () { fi } +upgrade_vlan () { + # Convert hostname.*** and zonecfg vlan configurations + UPGRADE_SCRIPT=/var/svc/profile/upgrade_datalink + + for ifname in $host_ifs $zone_ifs + do + phys=`echo $ifname | sed "s/[0-9]*$//"` + devnum=`echo $ifname | sed "s/$phys//g"` + if [ "$phys$devnum" != $ifname -o \ + -n "`echo $devnum | tr -d '[0-9]'`" ]; then + echo "skipping invalid interface $ifname" + continue + fi + + vid=`expr $devnum / 1000` + inst=`expr $devnum % 1000` + + if [ "$vid" != "0" ]; then + echo dladm create-vlan -l $phys$inst -v $vid $ifname \ + >> $rootprefix$UPGRADE_SCRIPT + fi + done +} + # Update aac.conf for set legacy-name-enable properly update_aac_conf() { @@ -1174,6 +1198,24 @@ migrate_acctadm_conf() svcadm enable $fmri fi + fmri="svc:/system/extended-accounting:net" + svccfg -s $fmri setprop config/file = \ + ${ACCTADM_NET_FILE:="none"} + svccfg -s $fmri setprop config/tracked = \ + ${ACCTADM_NET_TRACKED:="none"} + svccfg -s $fmri setprop config/untracked = \ + ${ACCTADM_NET_UNTRACKED:="extended"} + if [ ${ACCTADM_NET_ENABLE:="no"} = "yes" ]; then + svccfg -s $fmri setprop config/enabled = "true" + else + svccfg -s $fmri setprop config/enabled = "false" + fi + if [ $ACCTADM_NET_ENABLE = "yes" -o \ + $ACCTADM_NET_FILE != "none" -o \ + $ACCTADM_NET_TRACKED != "none" ]; then + svcadm enable $fmri + fi + rm /etc/acctadm.conf fi _EOF @@ -4762,6 +4804,28 @@ then fi # + # save vlans associated with zones to be upgraded + # to the new dladm based format + # + flowadm_status="old" + if [[ ! -f $root/sbin/flowadm ]] && \ + archive_file_exists generic.sbin "sbin/flowadm"; then + flowadm_status="new" + host_ifs=`ls -1 $rootprefix/etc | egrep -e \ + '^hostname.|^hostname6.|^dhcp.'| cut -d . -f2 | sort -u` + zones=`zoneadm list -c | grep -v global` + for zone in $zones + do + zonecfg -z $zone info ip-type | grep exclusive \ + >/dev/null + if [ $? -eq 0 ]; then + zif=`zonecfg -z $zone info net | \ + grep physical | nawk '{print $2}'` + zone_ifs="$zone_ifs $zif" + fi + done + fi + # # Stop sendmail so that mail doesn't bounce during the interval # where /etc/mail/aliases is (effectively) empty. # @@ -7593,6 +7657,7 @@ mondo_loop() { # rm -f $root/usr/lib/rcm/modules/SUNW_vlan_rcm.so rm -f $root/usr/lib/rcm/modules/SUNW_aggr_rcm.so + rm -f $root/usr/lib/rcm/modules/SUNW_vnic_rcm.so rm -f $root/kernel/drv/softmac rm -f $root/kernel/drv/sparcv9/softmac rm -f $root/kernel/drv/amd64/softmac @@ -8077,6 +8142,11 @@ mondo_loop() { fi fi + # upgrade hostname and zones based vlans to dladm + if [[ $flowadm_status == "new" ]]; then + upgrade_vlan + fi + # The global zone needs to have its /dev/dld symlink created # during install so that processes can access it early in boot # before devfsadm is run. 
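Both the SUNWcnetr postinstall script and the upgrade_vlan() function above
recover VLAN configuration from a legacy interface name: the numeric suffix
of a hostname.<ifname> entry encodes vid * 1000 + instance, and a
"dladm create-vlan" line is emitted whenever the decoded vid is non-zero.
The following minimal C sketch shows that decoding; the function and
variable names (vlan_ppa_decode and friends) are illustrative only and are
not part of this change.

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/*
	 * Split a legacy interface name such as "bge123001" into its
	 * driver prefix, VLAN id and instance, mirroring the sed/expr
	 * pipeline in the upgrade scripts:
	 *	vid = devnum / 1000, inst = devnum % 1000
	 * Returns -1 for names the scripts would skip as invalid.
	 */
	static int
	vlan_ppa_decode(const char *ifname, char *drv, size_t drvlen,
	    int *vidp, int *instp)
	{
		const char *p = ifname + strlen(ifname);
		int devnum;

		/* Walk back over the trailing digits (the PPA). */
		while (p > ifname && isdigit((unsigned char)p[-1]))
			p--;

		/* Need both a driver prefix and a numeric suffix. */
		if (p == ifname || *p == '\0' ||
		    (size_t)(p - ifname) >= drvlen)
			return (-1);

		(void) strncpy(drv, ifname, p - ifname);
		drv[p - ifname] = '\0';

		devnum = atoi(p);
		*vidp = devnum / 1000;		/* vid=`expr $devnum / 1000` */
		*instp = devnum % 1000;		/* inst=`expr $devnum % 1000` */
		return (0);
	}

	int
	main(void)
	{
		char drv[32];
		int vid, inst;

		/* bge123001 -> VLAN 123 on instance bge1 */
		if (vlan_ppa_decode("bge123001", drv, sizeof (drv),
		    &vid, &inst) == 0 && vid != 0)
			(void) printf("dladm create-vlan -l %s%d -v %d %s\n",
			    drv, inst, vid, "bge123001");
		return (0);
	}

A decoded vid of 0 means the suffix is an ordinary PPA (for example bge0),
so no create-vlan line is generated; this matches the [ "$vid" != "0" ]
test in both scripts.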
diff --git a/usr/src/uts/common/Makefile b/usr/src/uts/common/Makefile index 5b8f6bbc6b..7cf2f14f64 100644 --- a/usr/src/uts/common/Makefile +++ b/usr/src/uts/common/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2002-2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # uts/common/Makefile # diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 2a54074941..564b2cf72e 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -174,6 +174,7 @@ GENUNIX_OBJS += \ inet_ntop.o \ instance.o \ ioctl.o \ + ip_cksum.o \ issetugid.o \ ippconf.o \ kcpc.o \ @@ -265,6 +266,7 @@ GENUNIX_OBJS += \ sidsys.o \ sched.o \ schedctl.o \ + sctp_crc32.o \ seg_dev.o \ seg_kp.o \ seg_kpm.o \ @@ -474,7 +476,7 @@ IP_ICMP_OBJS = icmp.o icmp_opt_data.o IP_RTS_OBJS = rts.o rts_opt_data.o IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o IP_UDP_OBJS = udp.o udp_opt_data.o -IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \ +IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_init.o sctp_input.o sctp_cookie.o \ sctp_conn.o sctp_error.o sctp_snmp.o \ sctp_param.o sctp_shutdown.o sctp_common.o \ @@ -483,7 +485,7 @@ IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ - ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ + ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ @@ -560,14 +562,15 @@ CLONE_OBJS += clone.o CN_OBJS += cons.o -DLD_OBJS += dld_drv.o dld_proto.o dld_str.o +DLD_OBJS += dld_drv.o dld_proto.o dld_str.o dld_flow.o -DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o \ - dls_soft_ring.o dls_mgmt.o +DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o GLD_OBJS += gld.o gldutil.o -MAC_OBJS += mac.o mac_mod.o mac_stat.o mac_ndd.o +MAC_OBJS += mac.o mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \ + mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \ + mac_soft_ring.o mac_stat.o mac_util.o MAC_ETHER_OBJS += mac_ether.o @@ -578,8 +581,6 @@ MAC_IB_OBJS += mac_ib.o AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ aggr_send.o aggr_recv.o aggr_lacp.o -VNIC_OBJS += vnic_ctl.o vnic_dev.o vnic_bcast.o vnic_cl.o - SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \ softmac_dev.o softmac_stat.o softmac_pkt.o @@ -588,6 +589,8 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \ net80211_crypto_tkip.o net80211_crypto_ccmp.o +VNIC_OBJS += vnic_ctl.o vnic_dev.o + IB_OBJS += ibnex.o ibnex_ioctl.o IBCM_OBJS += ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o 
ibcm_path.o \ @@ -1724,18 +1727,17 @@ IXGBE_OBJS = ixgbe_82598.o ixgbe_api.o ixgbe_common.o \ # # NIU 10G/1G driver module # -NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \ - nxge_txdma.o nxge_txc.o nxge_main.o \ +NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \ + nxge_txdma.o nxge_txc.o nxge_main.o \ nxge_hw.o nxge_fzc.o nxge_virtual.o \ nxge_send.o nxge_classify.o nxge_fflp.o \ nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o \ - nxge_zcp.o nxge_fm.o nxge_espc.o \ - nxge_serialize.o nxge_hv.o \ + nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o \ nxge_hio.o nxge_hio_guest.o nxge_intr.o NXGE_NPI_OBJS = \ - npi.o npi_mac.o npi_ipp.o \ - npi_txdma.o npi_rxdma.o npi_txc.o \ + npi.o npi_mac.o npi_ipp.o \ + npi_txdma.o npi_rxdma.o npi_txc.o \ npi_zcp.o npi_espc.o npi_fflp.o \ npi_vir.o diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 09a34afa80..c7ccff8a14 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -50,6 +50,7 @@ extern "C" { #ifdef _KERNEL #include <netinet/ip6.h> #include <sys/avl.h> +#include <sys/list.h> #include <sys/vmem.h> #include <sys/squeue.h> #include <net/route.h> @@ -380,6 +381,13 @@ typedef struct ipf_s { uint32_t ipf_checksum; /* Partial checksum of fragment data */ } ipf_t; +/* + * IPv4 Fragments + */ +#define IS_V4_FRAGMENT(ipha_fragment_offset_and_flags) \ + (((ntohs(ipha_fragment_offset_and_flags) & IPH_OFFSET) != 0) || \ + ((ntohs(ipha_fragment_offset_and_flags) & IPH_MF) != 0)) + #define ipf_src V4_PART_OF_V6(ipf_v6src) #define ipf_dst V4_PART_OF_V6(ipf_v6dst) @@ -1718,9 +1726,10 @@ typedef union ill_g_head_u { #define ILL_CAPAB_MDT 0x04 /* Multidata Transmit */ #define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */ #define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */ -#define ILL_CAPAB_POLL 0x20 /* Polling Toggle */ -#define ILL_CAPAB_SOFT_RING 0x40 /* Soft_Ring capability */ -#define ILL_CAPAB_LSO 0x80 /* Large Segment Offload */ +#define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ +#define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ +#define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ +#define ILL_CAPAB_DLD_LSO 0x100 /* Large Segment Offload */ /* * Per-ill Multidata Transmit capabilities. @@ -1743,9 +1752,9 @@ typedef struct ill_hcksum_capab_s ill_hcksum_capab_t; typedef struct ill_zerocopy_capab_s ill_zerocopy_capab_t; /* - * Per-ill Polling/soft ring capbilities. + * DLD capbilities. */ -typedef struct ill_dls_capab_s ill_dls_capab_t; +typedef struct ill_dld_capab_s ill_dld_capab_t; /* * Per-ill polling resource map. @@ -1762,7 +1771,6 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; #define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */ #define ILL_CHANGING 0x04 /* ILL not globally visible */ #define ILL_DL_UNBIND_IN_PROGRESS 0x08 /* UNBIND_REQ is sent */ -#define ILL_SOFT_RING_ASSIGN 0x10 /* Making soft ring assignment */ /* Is this an ILL whose source address is used by other ILL's ? 
*/ #define IS_USESRC_ILL(ill) \ @@ -1870,8 +1878,10 @@ typedef struct ill_s { ill_note_link : 1, /* supports link-up notification */ ill_capab_reneg : 1, /* capability renegotiation to be done */ + ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ ill_need_recover_multicast : 1, - ill_pad_to_bit_31 : 17; + + ill_pad_to_bit_31 : 16; /* Following bit fields protected by ill_lock */ uint_t @@ -1883,6 +1893,7 @@ typedef struct ill_s { ill_arp_bringup_pending : 1, ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */ ill_arp_extend : 1, /* ARP has DAD extensions */ + ill_pad_bit_31 : 25; /* @@ -1903,15 +1914,17 @@ typedef struct ill_s { /* * Capabilities related fields. */ - uint_t ill_dlpi_capab_state; /* State of capability query, IDS_* */ + uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */ + uint_t ill_capab_pending_cnt; uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */ ill_mdt_capab_t *ill_mdt_capab; /* Multidata Transmit capabilities */ ill_ipsec_capab_t *ill_ipsec_capab_ah; /* IPsec AH capabilities */ ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */ ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */ ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */ - ill_dls_capab_t *ill_dls_capab; /* Polling, soft ring capabilities */ - ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */ + ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */ + ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */ + mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */ /* * New fields for IPv6 @@ -1989,6 +2002,7 @@ typedef struct ill_s { zoneid_t ill_zoneid; ip_stack_t *ill_ipst; /* Corresponds to a netstack_hold */ uint32_t ill_dhcpinit; /* IP_DHCPINIT_IFs for ill */ + void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ } ill_t; @@ -2069,6 +2083,7 @@ typedef struct ill_s { * ill_type ipsq + down ill only when ill is up * ill_dlpi_multicast_state ill_lock ill_lock * ill_dlpi_fastpath_state ill_lock ill_lock + * ill_dlpi_capab_state ipsq ipsq * ill_max_hops ipsq Not atomic * * ill_max_mtu @@ -2110,6 +2125,8 @@ typedef struct ill_s { * ill_trace ill_lock ill_lock * ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock * ill_dhcpinit atomics atomics + * ill_flownotify_mh write once write once + * ill_capab_pending_cnt ipsq ipsq */ /* @@ -2182,13 +2199,22 @@ typedef struct ipmx_s { * State for detecting if a driver supports certain features. * Support for DL_ENABMULTI_REQ uses ill_dlpi_multicast_state. * Support for DLPI M_DATA fastpath uses ill_dlpi_fastpath_state. - * Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state. */ #define IDS_UNKNOWN 0 /* No DLPI request sent */ #define IDS_INPROGRESS 1 /* DLPI request sent */ #define IDS_OK 2 /* DLPI request completed successfully */ #define IDS_FAILED 3 /* DLPI request failed */ +/* Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state. 
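+ * The IDCS_* values below also cover capability reset and renegotiation.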
*/ +enum { + IDCS_UNKNOWN, + IDCS_PROBE_SENT, + IDCS_OK, + IDCS_RESET_SENT, + IDCS_RENEG, + IDCS_FAILED +}; + /* Named Dispatch Parameter Management Structure */ typedef struct ipparam_s { uint_t ip_param_min; @@ -3165,6 +3191,8 @@ extern int ip_opt_set_ill(conn_t *, int, boolean_t, boolean_t, extern void ip_rput(queue_t *, mblk_t *); extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, struct mac_header_info_s *); +extern mblk_t *ip_accept_tcp(ill_t *, ill_rx_ring_t *, squeue_t *, + mblk_t *, mblk_t **, uint_t *cnt); extern void ip_rput_dlpi(queue_t *, mblk_t *); extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *); @@ -3201,13 +3229,13 @@ extern ipaddr_t ip_net_mask(ipaddr_t); extern void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, ip_stack_t *); extern ipxmit_state_t ip_xmit_v4(mblk_t *, ire_t *, struct ipsec_out_s *, - boolean_t); + boolean_t, conn_t *); extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); extern struct qinit iprinitv6; extern struct qinit ipwinitv6; -extern void conn_drain_insert(conn_t *connp); +extern void conn_drain_insert(conn_t *connp); extern int conn_ipsec_length(conn_t *connp); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); @@ -3437,17 +3465,22 @@ struct ill_zerocopy_capab_s { }; struct ill_lso_capab_s { - uint_t ill_lso_version; /* interface version */ uint_t ill_lso_on; /* on/off switch for LSO on this ILL */ uint_t ill_lso_flags; /* capabilities */ uint_t ill_lso_max; /* maximum size of payload */ }; -/* Possible ill_states */ -#define ILL_RING_INPROC 3 /* Being assigned to squeue */ -#define ILL_RING_INUSE 2 /* Already Assigned to Rx Ring */ -#define ILL_RING_BEING_FREED 1 /* Being Unassigned */ -#define ILL_RING_FREE 0 /* Available to be assigned to Ring */ +/* + * rr_ring_state cycles in the order shown below from RR_FREE through + * RR_FREE_IN_PROG and back to RR_FREE. + */ +typedef enum { + RR_FREE, /* Free slot */ + RR_SQUEUE_UNBOUND, /* Ring's squeue is unbound */ + RR_SQUEUE_BIND_INPROG, /* Ring's squeue bind in progress */ + RR_SQUEUE_BOUND, /* Ring's squeue bound to cpu */ + RR_FREE_INPROG /* Ring is being freed */ +} ip_ring_state_t; #define ILL_MAX_RINGS 256 /* Max num of rx rings we can manage */ #define ILL_POLLING 0x01 /* Polling in use */ @@ -3457,73 +3490,92 @@ struct ill_lso_capab_s { * we need to duplicate the definitions here because we cannot * include mac/dls header files here. */ -typedef void (*ip_mac_blank_t)(void *, time_t, uint_t); -typedef void (*ip_dld_tx_t)(void *, mblk_t *); +typedef void *ip_mac_tx_cookie_t; +typedef void (*ip_mac_intr_disable_t)(void *); +typedef void (*ip_mac_intr_enable_t)(void *); +typedef void *(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); +typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); +typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); -typedef void (*ip_dls_chg_soft_ring_t)(void *, int); -typedef void (*ip_dls_bind_t)(void *, processorid_t); -typedef void (*ip_dls_unbind_t)(void *); +/* + * POLLING README + * sq_get_pkts() is called to pick packets from softring in poll mode. It + * calls rr_rx to get the chain and process it with rr_ip_accept. + * rr_rx = mac_soft_ring_poll() to pick packets + * rr_ip_accept = ip_accept_tcp() to process packets + */ +/* + * XXX: With protocol, service specific squeues, they will have + * specific acceptor functions. 
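+ * (Today ip_accept_tcp() is the only such acceptor.)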
+ */ +typedef mblk_t *(*ip_mac_rx_t)(void *, size_t); +typedef mblk_t *(*ip_accept_t)(ill_t *, ill_rx_ring_t *, + squeue_t *, mblk_t *, mblk_t **, uint_t *); + +/* + * rr_intr_enable, rr_intr_disable, rr_rx_handle, rr_rx: + * May be accessed while in the squeue AND after checking that SQS_POLL_CAPAB + * is set. + * + * rr_ring_state: Protected by ill_lock. + */ struct ill_rx_ring { - ip_mac_blank_t rr_blank; /* Driver interrupt blanking func */ - void *rr_handle; /* Handle for Rx ring */ + ip_mac_intr_disable_t rr_intr_disable; /* Interrupt disabling func */ + ip_mac_intr_enable_t rr_intr_enable; /* Interrupt enabling func */ + void *rr_intr_handle; /* Handle interrupt funcs */ + ip_mac_rx_t rr_rx; /* Driver receive function */ + ip_accept_t rr_ip_accept; /* IP accept function */ + void *rr_rx_handle; /* Handle for Rx ring */ squeue_t *rr_sqp; /* Squeue the ring is bound to */ - ill_t *rr_ill; /* back pointer to ill */ - clock_t rr_poll_time; /* Last lbolt polling was used */ - uint32_t rr_poll_state; /* polling state flags */ - uint32_t rr_max_blank_time; /* Max interrupt blank */ - uint32_t rr_min_blank_time; /* Min interrupt blank */ - uint32_t rr_max_pkt_cnt; /* Max pkts before interrupt */ - uint32_t rr_min_pkt_cnt; /* Mix pkts before interrupt */ - uint32_t rr_normal_blank_time; /* Normal intr freq */ - uint32_t rr_normal_pkt_cnt; /* Normal intr pkt cnt */ - uint32_t rr_ring_state; /* State of this ring */ + ill_t *rr_ill; /* back pointer to ill */ + ip_ring_state_t rr_ring_state; /* State of this ring */ }; -struct ill_dls_capab_s { - ip_dld_tx_t ill_tx; /* Driver Tx routine */ - void *ill_tx_handle; /* Driver Tx handle */ - ip_dls_chg_soft_ring_t ill_dls_change_status; - /* change soft ring fanout */ - ip_dls_bind_t ill_dls_bind; /* to add CPU affinity */ - ip_dls_unbind_t ill_dls_unbind; /* remove CPU affinity */ - ill_rx_ring_t *ill_ring_tbl; /* Ring to Sqp mapping table */ - uint_t ill_dls_soft_ring_cnt; /* Number of soft ring */ - conn_t *ill_unbind_conn; /* Conn used during unplumb */ +/* + * IP - DLD direct function call capability + * Suffixes, df - dld function, dh - dld handle, + * cf - client (IP) function, ch - client handle + */ +typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ + ip_dld_tx_t idd_tx_df; /* str_mdata_fastpath_put */ + void *idd_tx_dh; /* dld_str_t *dsp */ + ip_dld_callb_t idd_tx_cb_df; /* mac_tx_srs_notify */ + void *idd_tx_cb_dh; /* mac_client_handle_t *mch */ +} ill_dld_direct_t; + +/* IP - DLD polling capability */ +typedef struct ill_dld_poll_s { + ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS]; +} ill_dld_poll_t; + +/* Describes ill->ill_dld_capab */ +struct ill_dld_capab_s { + ip_capab_func_t idc_capab_df; /* dld_capab_func */ + void *idc_capab_dh; /* dld_str_t *dsp */ + ill_dld_direct_t idc_direct; + ill_dld_poll_t idc_poll; }; /* * IP squeues exports */ -extern int ip_squeue_profile; -extern int ip_squeue_bind; extern boolean_t ip_squeue_fanout; -extern boolean_t ip_squeue_soft_ring; -extern uint_t ip_threads_per_cpu; -extern uint_t ip_squeues_per_cpu; -extern uint_t ip_soft_rings_cnt; - -typedef struct squeue_set_s { - kmutex_t sqs_lock; - struct squeue_s **sqs_list; - int sqs_size; - int sqs_max_size; - processorid_t sqs_bind; -} squeue_set_t; - -#define IP_SQUEUE_GET(hint) \ - ((!ip_squeue_fanout) ? 
(CPU->cpu_squeue_set->sqs_list[0]) : \ - ip_squeue_random(hint)) -typedef void (*squeue_func_t)(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t); +#define IP_SQUEUE_GET(hint) ip_squeue_random(hint) extern void ip_squeue_init(void (*)(squeue_t *)); extern squeue_t *ip_squeue_random(uint_t); extern squeue_t *ip_squeue_get(ill_rx_ring_t *); -extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); +extern squeue_t *ip_squeue_getfree(pri_t); +extern int ip_squeue_cpu_move(squeue_t *, processorid_t); +extern void *ip_squeue_add_ring(ill_t *, void *); +extern void ip_squeue_bind_ring(ill_t *, ill_rx_ring_t *, processorid_t); +extern void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *); +extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *); +extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_clean_all(ill_t *); -extern void ip_soft_ring_assignment(ill_t *, ill_rx_ring_t *, - mblk_t *, struct mac_header_info_s *); extern void ip_resume_tcp_bind(void *, mblk_t *, void *); extern void tcp_wput(queue_t *, mblk_t *); @@ -3580,6 +3632,9 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_TCP_KSSL_INPUT 36 #define SQTAG_TCP_DROP_Q0 37 #define SQTAG_TCP_CONN_REQ_2 38 +#define SQTAG_IP_INPUT_RX_RING 39 +#define SQTAG_SQUEUE_CHANGE 40 +#define SQTAG_CONNECT_FINISH 41 #define NOT_OVER_IP(ip_wq) \ (ip_wq->q_next != NULL || \ diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 553a975c54..90cc6a51d5 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -24,9 +24,6 @@ */ /* Copyright (c) 1990 Mentat Inc. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> @@ -4331,8 +4328,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) } mblk_setcred(mp, connp->conn_cred); - ip_output_options(connp, mp, q, IP_WPUT, - &optinfo); + ip_output_options(connp, mp, q, IP_WPUT, &optinfo); } static boolean_t diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index ecfafc5e51..091509c71e 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -24,8 +24,6 @@ */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Internet Group Management Protocol (IGMP) routines. * Multicast Listener Discovery Protocol (MLD) routines. 
@@ -1439,7 +1437,7 @@ igmp_timeout_handler(void *arg) if (!ill_waiter_inc(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE); + success = ipsq_enter(ill, B_TRUE, NEW_OP); if (success) { next = igmp_timeout_handler_per_ill(ill); if (next < global_next) @@ -1682,7 +1680,7 @@ mld_timeout_handler(void *arg) if (!ill_waiter_inc(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE); + success = ipsq_enter(ill, B_TRUE, NEW_OP); if (success) { next = mld_timeout_handler_per_ill(ill); if (next < global_next) diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 5eb9a7e1d2..b0eaa51983 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -46,6 +46,7 @@ #include <sys/atomic.h> #include <sys/policy.h> #include <sys/priv.h> +#include <sys/taskq.h> #include <sys/systm.h> #include <sys/param.h> @@ -125,16 +126,17 @@ #include <sys/tsol/tnet.h> #include <rpc/pmap_prot.h> +#include <sys/squeue_impl.h> /* * Values for squeue switch: - * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain - * IP_SQUEUE_ENTER: squeue_enter - * IP_SQUEUE_FILL: squeue_fill + * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN + * IP_SQUEUE_ENTER: SQ_PROCESS + * IP_SQUEUE_FILL: SQ_FILL */ int ip_squeue_enter = 2; /* Setable in /etc/system */ -squeue_func_t ip_input_proc; +int ip_squeue_flag; #define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x)) /* @@ -391,6 +393,11 @@ void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t, in6_addr_t, * gcgrp_rwlock -> ire_lock * gcgrp_rwlock -> gcdb_lock * + * squeue(sq_lock), flow related (ft_lock, fe_lock) locking + * + * cpu_lock --> ill_lock --> sqset_lock --> sq_lock + * sq_lock -> conn_lock -> QLOCK(q) + * ill_lock -> ft_lock -> fe_lock * * Routing/forwarding table locking notes: * @@ -730,7 +737,7 @@ static boolean_t ip_source_route_included(ipha_t *); static void ip_trash_ire_reclaim_stack(ip_stack_t *); static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, - zoneid_t, ip_stack_t *); + zoneid_t, ip_stack_t *, conn_t *); static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *); static void ip_wput_local_options(ipha_t *, ip_stack_t *); static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, @@ -763,17 +770,13 @@ static void ip_multirt_bad_mtu(ire_t *, uint32_t); static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value, - caddr_t cp, cred_t *cr); -extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -static squeue_func_t ip_squeue_switch(int); +static int ip_squeue_switch(int); static void *ip_kstat_init(netstackid_t, ip_stack_t *); static void ip_kstat_fini(netstackid_t, kstat_t *); @@ -790,7 +793,7 @@ static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, - ipha_t *, ill_t *, boolean_t); + ipha_t *, ill_t *, boolean_t, boolean_t); static void ipobs_init(ip_stack_t *); static void ipobs_fini(ip_stack_t *); @@ -934,20 +937,14 @@ static ipndp_t lcl_ndp_arr[] = { "ip_rput_pullups" 
}, { ip_srcid_report, NULL, NULL, "ip_srcid_status" }, - { ip_param_generic_get, ip_squeue_profile_set, - (caddr_t)&ip_squeue_profile, "ip_squeue_profile" }, - { ip_param_generic_get, ip_squeue_bind_set, - (caddr_t)&ip_squeue_bind, "ip_squeue_bind" }, { ip_param_generic_get, ip_input_proc_set, (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, -#define IPNDP_CGTP_FILTER_OFFSET 11 +#define IPNDP_CGTP_FILTER_OFFSET 9 { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, "ip_cgtp_filter" }, - { ip_param_generic_get, ip_int_set, - (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }, -#define IPNDP_IPMP_HOOK_OFFSET 13 +#define IPNDP_IPMP_HOOK_OFFSET 10 { ip_param_generic_get, ipmp_hook_emulation_set, NULL, "ipmp_hook_emulation" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, @@ -2564,8 +2561,8 @@ icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, /* Have to change db_type after any pullupmsg */ DB_TYPE(mp) = M_CTL; - squeue_fill(connp->conn_sqp, first_mp, tcp_input, - connp, SQTAG_TCP_INPUT_ICMP_ERR); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, + SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR); return; case IPPROTO_SCTP: @@ -5367,34 +5364,13 @@ ip_modclose(ill_t *ill) ipif_t *ipif; queue_t *q = ill->ill_rq; ip_stack_t *ipst = ill->ill_ipst; - clock_t timeout; - - /* - * Wait for the ACKs of all deferred control messages to be processed. - * In particular, we wait for a potential capability reset initiated - * in ip_sioctl_plink() to complete before proceeding. - * - * Note: we wait for at most ip_modclose_ackwait_ms (by default 3000 ms) - * in case the driver never replies. - */ - timeout = lbolt + MSEC_TO_TICK(ip_modclose_ackwait_ms); - mutex_enter(&ill->ill_lock); - while (ill->ill_dlpi_pending != DL_PRIM_INVAL) { - if (cv_timedwait(&ill->ill_cv, &ill->ill_lock, timeout) < 0) { - /* Timeout */ - break; - } - } - mutex_exit(&ill->ill_lock); /* - * Forcibly enter the ipsq after some delay. This is to take - * care of the case when some ioctl does not complete because - * we sent a control message to the driver and it did not - * send us a reply. We want to be able to at least unplumb - * and replumb rather than force the user to reboot the system. + * The punlink prior to this may have initiated a capability + * negotiation. But ipsq_enter will block until that finishes or + * times out. */ - success = ipsq_enter(ill, B_FALSE); + success = ipsq_enter(ill, B_FALSE, NEW_OP); /* * Open/close/push/pop is guaranteed to be single threaded @@ -5661,33 +5637,6 @@ ip_conn_input(void *arg1, mblk_t *mp, void *arg2) putnext(connp->conn_rq, mp); } -/* Return the IP checksum for the IP header at "iph". 
*/ -uint16_t -ip_csum_hdr(ipha_t *ipha) -{ - uint16_t *uph; - uint32_t sum; - int opt_len; - - opt_len = (ipha->ipha_version_and_hdr_length & 0xF) - - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - uph = (uint16_t *)ipha; - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; - if (opt_len > 0) { - do { - sum += uph[10]; - sum += uph[11]; - uph += 2; - } while (--opt_len); - } - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum == 0xffff) - sum = 0; - return ((uint16_t)sum); -} - /* * Called when the module is about to be unloaded */ @@ -5741,6 +5690,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) */ ipv4_hook_shutdown(ipst); ipv6_hook_shutdown(ipst); + + mutex_enter(&ipst->ips_capab_taskq_lock); + ipst->ips_capab_taskq_quit = B_TRUE; + cv_signal(&ipst->ips_capab_taskq_cv); + mutex_exit(&ipst->ips_capab_taskq_lock); } /* @@ -5761,6 +5715,10 @@ ip_stack_fini(netstackid_t stackid, void *arg) ipv6_hook_destroy(ipst); ip_net_destroy(ipst); + mutex_destroy(&ipst->ips_capab_taskq_lock); + cv_destroy(&ipst->ips_capab_taskq_cv); + list_destroy(&ipst->ips_capab_taskq_list); + #ifdef NS_DEBUG printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); #endif @@ -5882,7 +5840,7 @@ ip_thread_exit(void *phash) void ip_ddi_init(void) { - ip_input_proc = ip_squeue_switch(ip_squeue_enter); + ip_squeue_flag = ip_squeue_switch(ip_squeue_enter); /* * For IP and TCP the minor numbers should start from 2 since we have 4 @@ -6043,6 +6001,16 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipv4_hook_init(ipst); ipv6_hook_init(ipst); + /* + * Create the taskq dispatcher thread and initialize related stuff. + */ + ipst->ips_capab_taskq_thread = thread_create(NULL, 0, + ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri); + mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL); + list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t), + offsetof(mblk_t, b_next)); + return (ipst); } @@ -6839,8 +6807,8 @@ ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); if (IPCL_IS_TCP(connp)) { /* do not drain, certain use cases can blow the stack */ - squeue_enter_nodrain(connp->conn_sqp, first_mp, - connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, + connp, ip_squeue_flag, SQTAG_IP_FANOUT_TCP); } else { /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ (connp->conn_recv)(connp, first_mp, NULL); @@ -7016,9 +6984,10 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, NULL, mctl_present); + /* Freed by ipsec_check_inbound_policy(). */ if (first_mp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; /* Freed by ipsec_check_inbound_policy(). 
*/ + return; } } if (mctl_present) @@ -9832,6 +9801,9 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, netstack_rele(ipst->ips_netstack); connp->conn_zoneid = zoneid; + connp->conn_sqp = NULL; + connp->conn_initial_sqp = NULL; + connp->conn_final_sqp = NULL; connp->conn_upq = q; q->q_ptr = WR(q)->q_ptr = connp; @@ -12977,6 +12949,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, mblk_t *mp1; boolean_t syn_present = B_FALSE; tcph_t *tcph; + uint_t tcph_flags; uint_t ip_hdr_len; ill_t *ill = (ill_t *)q->q_ptr; zoneid_t zoneid = ire->ire_zoneid; @@ -13121,6 +13094,9 @@ try_again: goto no_conn; } + tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; + tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG); + /* * TCP FAST PATH for AF_INET socket. * @@ -13138,12 +13114,17 @@ try_again: !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { ASSERT(first_mp == mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - SET_SQUEUE(mp, tcp_rput_data, connp); + if (tcph_flags != (TH_SYN | TH_ACK)) { + SET_SQUEUE(mp, tcp_rput_data, connp); + return (mp); + } + mp->b_datap->db_struioflag |= STRUIO_CONNECT; + DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring); + SET_SQUEUE(mp, tcp_input, connp); return (mp); } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { + if (tcph_flags == TH_SYN) { if (IPCL_IS_TCP(connp)) { mp->b_datap->db_struioflag |= STRUIO_EAGER; DB_CKSUMSTART(mp) = @@ -13165,7 +13146,6 @@ try_again: } syn_present = B_TRUE; } - } if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { @@ -13903,6 +13883,12 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) return (NULL); } +/* + * + * This is the fast forward path. If we are here, we dont need to + * worry about RSVP, CGTP, or TSol. Furthermore the ftable lookup + * needed to find the nexthop in this case is much simpler + */ ire_t * ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) { @@ -13928,6 +13914,12 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) */ ire_refrele(ire); ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); + /* + * ire_cache_lookup() can return ire of IRE_LOCAL in + * transient cases. In such case, just drop the packet + */ + if (ire->ire_type != IRE_CACHE) + goto drop; } /* @@ -13952,8 +13944,8 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) /* No ire cache of nexthop. So first create one */ if (ire == NULL) { - ire = ire_forward(dst, &ret_action, NULL, NULL, - NULL, ipst); + ire = ire_forward_simple(dst, &ret_action, ipst); + /* * We only come to ip_fast_forward if ip_cgtp_filter * is not set. 
So ire_forward() should not return with @@ -14001,7 +13993,6 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) pkt_len = ntohs(ipha->ipha_length); stq_ill = (ill_t *)ire->ire_stq->q_ptr; if (!(stq_ill->ill_flags & ILLF_ROUTER) || - !(ill->ill_flags & ILLF_ROUTER) || (ill == stq_ill) || (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || (ire->ire_nce == NULL) || @@ -14010,7 +14001,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || ipha->ipha_ttl <= 1) { ip_rput_process_forward(ill->ill_rq, mp, ire, - ipha, ill, B_FALSE); + ipha, ill, B_FALSE, B_TRUE); return (ire); } BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); @@ -14048,34 +14039,33 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); - dev_q = ire->ire_stq->q_next; - if ((dev_q->q_next != NULL || dev_q->q_first != NULL) && - !canputnext(ire->ire_stq)) { - goto indiscard; + if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) { + dev_q = ire->ire_stq->q_next; + if (DEV_Q_FLOW_BLOCKED(dev_q)) + goto indiscard; } - if (ILL_DLS_CAPABLE(stq_ill)) { - /* - * Send the packet directly to DLD, where it - * may be queued depending on the availability - * of transmit resources at the media layer. - */ - IP_DLS_ILL_TX(stq_ill, ipha, mp, ipst, hlen); - } else { - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, stq_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, stq_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (mp == NULL) - goto drop; - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, - ip6_t *, NULL, int, 0); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, stq_ill, ipha, mp, mp, 0, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, + ip6_t *, NULL, int, 0); - putnext(ire->ire_stq, mp); + if (mp != NULL) { + if (ipst->ips_ipobs_enabled) { + zoneid_t szone; + + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); + } + + ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC); } return (ire); @@ -14096,7 +14086,7 @@ drop: static void ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, - ill_t *ill, boolean_t ll_multicast) + ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) { ill_group_t *ill_group; ill_group_t *ire_group; @@ -14109,6 +14099,16 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ mp->b_next = NULL; /* ip_rput_noire sets dst here */ + /* + * If the caller of this function is ip_fast_forward() skip the + * next three checks as it does not apply. 
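(In that case the packet has already been screened by ip_fast_forward(), so the branch below only computes the ill_group/ire_group state that the code after the skip: label still relies on before jumping.)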
+ */ + if (from_ip_fast_forward) { + ill_group = ill->ill_group; + ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; + goto skip; + } + if (ll_multicast != 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); goto drop_pkt; @@ -14147,6 +14147,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * side-effect of that would be requiring an ire flush * whenever the ILLF_ROUTER flag changes. */ +skip: if (((ill->ill_flags & ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && @@ -14253,7 +14254,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, } sendit: dev_q = ire->ire_stq->q_next; - if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) { + if (DEV_Q_FLOW_BLOCKED(dev_q)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(mp); return; @@ -14447,7 +14448,7 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); ip_rput_process_forward(q, mp, ire, ipha, - ill, ll_multicast); + ill, ll_multicast, B_FALSE); ire_refrele(ire); return (NULL); } @@ -14904,6 +14905,15 @@ ip_fix_dbref(ill_t *ill, mblk_t *mp) return (mp1); } +#define ADD_TO_CHAIN(head, tail, cnt, mp) { \ + if (tail != NULL) \ + tail->b_next = mp; \ + else \ + head = mp; \ + tail = mp; \ + cnt++; \ +} + /* * Direct read side procedure capable of dealing with chains. GLDv3 based * drivers call this function directly with mblk chains while STREAMS @@ -14942,20 +14952,23 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, mblk_t *head = NULL; mblk_t *tail = NULL; mblk_t *first_mp; - mblk_t *mp; - mblk_t *dmp; int cnt = 0; ip_stack_t *ipst = ill->ill_ipst; + mblk_t *mp; + mblk_t *dmp; + uint8_t tag; ASSERT(mp_chain != NULL); ASSERT(ill != NULL); TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); + tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT; + #define rptr ((uchar_t *)ipha) while (mp_chain != NULL) { - first_mp = mp = mp_chain; + mp = mp_chain; mp_chain = mp_chain->b_next; mp->b_next = NULL; ll_multicast = 0; @@ -14987,6 +15000,15 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, * Given the above assumption, there is no need to walk * down the entire mblk chain (which could have a * potential performance problem) + * + * The "(DB_REF(mp) > 1)" check was moved from ip_rput() + * to here because of exclusive ip stacks and vnics. + * Packets transmitted from exclusive stack over vnic + * can have db_ref > 1 and when it gets looped back to + * another vnic in a different zone, you have ip_input() + * getting dblks with db_ref > 1. So if someone + * complains of TCP performance under this scenario, + * take a serious look here on the impact of copymsg(). 
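(In other words, any mblk arriving here with db_ref > 1 pays for a full copymsg() data copy before IP touches it; the DB_REF check below is that gate.)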
*/ if (DB_REF(mp) > 1) { @@ -15056,7 +15078,7 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, } } - /* Make sure its an M_DATA and that its aligned */ + /* Only M_DATA can come here and it is always aligned */ ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); @@ -15140,7 +15162,6 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, continue; } dst = ipha->ipha_dst; - /* * Attach any necessary label information to * this packet @@ -15194,16 +15215,18 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { if (ire == NULL) - ire = ire_cache_lookup(dst, ALL_ZONES, NULL, - ipst); - - /* incoming packet is for forwarding */ - if (ire == NULL || (ire->ire_type & IRE_CACHE)) { + ire = ire_cache_lookup_simple(dst, ipst); + /* + * Unless forwarding is enabled, dont call + * ip_fast_forward(). Incoming packet is for forwarding + */ + if ((ill->ill_flags & ILLF_ROUTER) && + (ire == NULL || (ire->ire_type & IRE_CACHE))) { ire = ip_fast_forward(ire, dst, ill, mp); continue; } /* incoming packet is for local consumption */ - if (ire->ire_type & IRE_LOCAL) + if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) goto local; } @@ -15363,7 +15386,7 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, } else if (ire->ire_stq != NULL) { /* fowarding? */ ip_rput_process_forward(q, mp, ire, ipha, ill, - ll_multicast); + ll_multicast, B_FALSE); /* ip_rput_process_forward consumed the packet */ continue; } @@ -15414,8 +15437,8 @@ local: * changes. */ IP_STAT(ipst, ip_input_multi_squeue); - squeue_enter_chain(curr_sqp, head, - tail, cnt, SQTAG_IP_INPUT); + SQUEUE_ENTER(curr_sqp, head, + tail, cnt, SQ_PROCESS, tag); curr_sqp = GET_SQUEUE(mp); head = mp; tail = mp; @@ -15444,33 +15467,231 @@ local: ire_refrele(ire); if (head != NULL) - squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); + SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag); - /* - * This code is there just to make netperf/ttcp look good. - * - * Its possible that after being in polling mode (and having cleared - * the backlog), squeues have turned the interrupt frequency higher - * to improve latency at the expense of more CPU utilization (less - * packets per interrupts or more number of interrupts). Workloads - * like ttcp/netperf do manage to tickle polling once in a while - * but for the remaining time, stay in higher interrupt mode since - * their packet arrival rate is pretty uniform and this shows up - * as higher CPU utilization. Since people care about CPU utilization - * while running netperf/ttcp, turn the interrupt frequency back to - * normal/default if polling has not been used in ip_poll_normal_ticks. - */ - if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) { - if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) { - ip_ring->rr_poll_state &= ~ILL_POLLING; - ip_ring->rr_blank(ip_ring->rr_handle, - ip_ring->rr_normal_blank_time, - ip_ring->rr_normal_pkt_cnt); + TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, + "ip_input_end: q %p (%S)", q, "end"); +#undef rptr +} + +/* + * ip_accept_tcp() - This function is called by the squeue when it retrieves + * a chain of packets in the poll mode. The packets have gone through the + * data link processing but not IP processing. For performance and latency + * reasons, the squeue wants to process the chain in line instead of feeding + * it back via ip_input path. 
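(As implemented below: packets that pass every check collect on an "accepted" chain returned to the squeue through *last and *cnt, while the rest collect on an "unaccepted" chain that is fed through the normal ip_input() path before returning.)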
+ * + * So this is a light weight function which checks to see if the packets + * retrived are indeed TCP packets (TCP squeue always polls TCP soft ring + * but we still do the paranoid check) meant for local machine and we don't + * have labels etc enabled. Packets that meet the criterion are returned to + * the squeue and processed inline while the rest go via ip_input path. + */ +/*ARGSUSED*/ +mblk_t * +ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, + mblk_t *mp_chain, mblk_t **last, uint_t *cnt) +{ + mblk_t *mp; + ipaddr_t dst = NULL; + ipaddr_t prev_dst; + ire_t *ire = NULL; + ipha_t *ipha; + uint_t pkt_len; + ssize_t len; + uint_t opt_len; + queue_t *q = ill->ill_rq; + squeue_t *curr_sqp; + mblk_t *ahead = NULL; /* Accepted head */ + mblk_t *atail = NULL; /* Accepted tail */ + uint_t acnt = 0; /* Accepted count */ + mblk_t *utail = NULL; /* Unaccepted head */ + mblk_t *uhead = NULL; /* Unaccepted tail */ + uint_t ucnt = 0; /* Unaccepted cnt */ + ip_stack_t *ipst = ill->ill_ipst; + + *cnt = 0; + + ASSERT(ill != NULL); + ASSERT(ip_ring != NULL); + + TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q); + +#define rptr ((uchar_t *)ipha) + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp_chain->b_next; + mp->b_next = NULL; + + /* + * We do ire caching from one iteration to + * another. In the event the packet chain contains + * all packets from the same dst, this caching saves + * an ire_cache_lookup for each of the succeeding + * packets in a packet chain. + */ + prev_dst = dst; + + ipha = (ipha_t *)mp->b_rptr; + len = mp->b_wptr - rptr; + + ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha)); + + /* + * If it is a non TCP packet, or doesn't have H/W cksum, + * or doesn't have min len, reject. + */ + if ((ipha->ipha_protocol != IPPROTO_TCP) || (len < + (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + continue; } + + pkt_len = ntohs(ipha->ipha_length); + if (len != pkt_len) { + if (len > pkt_len) { + mp->b_wptr = rptr + pkt_len; + } else { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + continue; + } } - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_input_end: q %p (%S)", q, "end"); + opt_len = ipha->ipha_version_and_hdr_length - + IP_SIMPLE_HDR_VERSION; + dst = ipha->ipha_dst; + + /* IP version bad or there are IP options */ + if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill, + mp, &ipha, &dst, ipst))) + continue; + + if (is_system_labeled() || (ill->ill_dhcpinit != 0) || + (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL)) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + continue; + } + + /* + * Reuse the cached ire only if the ipha_dst of the previous + * packet is the same as the current packet AND it is not + * INADDR_ANY. + */ + if (!(dst == prev_dst && dst != INADDR_ANY) && + (ire != NULL)) { + ire_refrele(ire); + ire = NULL; + } + + if (ire == NULL) + ire = ire_cache_lookup_simple(dst, ipst); + + /* + * Unless forwarding is enabled, dont call + * ip_fast_forward(). 
Incoming packet is for forwarding + */ + if ((ill->ill_flags & ILLF_ROUTER) && + (ire == NULL || (ire->ire_type & IRE_CACHE))) { + + DTRACE_PROBE4(ip4__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ipha_t *, ipha, mblk_t *, mp); + + FW_HOOKS(ipst->ips_ip4_physical_in_event, + ipst->ips_ipv4firewall_physical_in, + ill, NULL, ipha, mp, mp, 0, ipst); + + DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, + pkt_len); + + ire = ip_fast_forward(ire, dst, ill, mp); + continue; + } + + /* incoming packet is for local consumption */ + if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) + goto local_accept; + + /* + * Disable ire caching for anything more complex + * than the simple fast path case we checked for above. + */ + if (ire != NULL) { + ire_refrele(ire); + ire = NULL; + } + + ire = ire_cache_lookup(dst, ALL_ZONES, MBLK_GETLABEL(mp), + ipst); + if (ire == NULL || ire->ire_type == IRE_BROADCAST || + ire->ire_stq != NULL) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + if (ire != NULL) { + ire_refrele(ire); + ire = NULL; + } + continue; + } + +local_accept: + + if (ire->ire_rfq != q) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + if (ire != NULL) { + ire_refrele(ire); + ire = NULL; + } + continue; + } + + /* + * The event for packets being received from a 'physical' + * interface is placed after validation of the source and/or + * destination address as being local so that packets can be + * redirected to loopback addresses using ipnat. + */ + DTRACE_PROBE4(ip4__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ipha_t *, ipha, mblk_t *, mp); + + FW_HOOKS(ipst->ips_ip4_physical_in_event, + ipst->ips_ipv4firewall_physical_in, + ill, NULL, ipha, mp, mp, 0, ipst); + + DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); + + if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp, + 0, q, ip_ring)) != NULL) { + if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) { + ADD_TO_CHAIN(ahead, atail, acnt, mp); + } else { + SQUEUE_ENTER(curr_sqp, mp, mp, 1, + SQ_FILL, SQTAG_IP_INPUT); + } + } + } + + if (ire != NULL) + ire_refrele(ire); + + if (uhead != NULL) + ip_input(ill, ip_ring, uhead, NULL); + + if (ahead != NULL) { + *last = atail; + *cnt = acnt; + return (ahead); + } + + return (NULL); #undef rptr } @@ -15770,11 +15991,18 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); /* Don't want to pass this up */ return; - - case DL_CAPABILITY_REQ: case DL_CONTROL_REQ: + ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " + "DL_CONTROL_REQ\n")); ill_dlpi_done(ill, dlea->dl_error_primitive); - ill->ill_dlpi_capab_state = IDS_FAILED; + freemsg(mp); + return; + case DL_CAPABILITY_REQ: + ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " + "DL_CAPABILITY REQ\n")); + if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) + ill->ill_dlpi_capab_state = IDCS_FAILED; + ill_capability_done(ill); freemsg(mp); return; } @@ -15814,19 +16042,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) dlea->dl_errno, dlea->dl_unix_errno); break; case DL_CAPABILITY_ACK: - /* Call a routine to handle this one. */ - ill_dlpi_done(ill, DL_CAPABILITY_REQ); ill_capability_ack(ill, mp); - /* - * If the ack is due to renegotiation, we will need to send - * a new CAPABILITY_REQ to start the renegotiation. 
+ * The message has been handed off to ill_capability_ack + * and must not be freed below */ - if (ill->ill_capab_reneg) { - ill->ill_capab_reneg = B_FALSE; - ill_capability_probe(ill); - } + mp = NULL; break; + case DL_CONTROL_ACK: /* We treat all of these as "fire and forget" */ ill_dlpi_done(ill, DL_CONTROL_REQ); @@ -16117,10 +16340,9 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * and the renegotiation has not been started yet; * nothing needs to be done in this case. */ - if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) { - ill_capability_reset(ill); - ill->ill_capab_reneg = B_TRUE; - } + ipsq_current_start(ipsq, ill->ill_ipif, 0); + ill_capability_reset(ill, B_TRUE); + ipsq_current_finish(ipsq); break; default: ip0dbg(("ip_rput_dlpi_writer: unknown notification " @@ -16661,7 +16883,8 @@ ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) max_frag -= secopt_size; } - ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, GLOBAL_ZONEID, ipst); + ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, + GLOBAL_ZONEID, ipst, NULL); ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); return; } @@ -16677,7 +16900,7 @@ ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) mp->b_prev = (mblk_t *)IPP_FWD_OUT; ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); - (void) ip_xmit_v4(mp, ire, NULL, B_FALSE); + (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL); /* ip_xmit_v4 always consumes the packet */ return; @@ -17049,9 +17272,12 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) mp = ip_tcp_input(mp, ipha, ill, B_TRUE, ire, ipsec_mp, 0, ill->ill_rq, NULL); IRE_REFRELE(ire); - if (mp != NULL) - squeue_enter_chain(GET_SQUEUE(mp), mp, - mp, 1, SQTAG_IP_PROTO_AGAIN); + if (mp != NULL) { + + SQUEUE_ENTER(GET_SQUEUE(mp), mp, + mp, 1, SQ_PROCESS, + SQTAG_IP_PROTO_AGAIN); + } break; case IPPROTO_SCTP: if (!ire_need_rele) @@ -21721,7 +21947,7 @@ conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) */ static void ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, - ip_stack_t *ipst) + ip_stack_t *ipst, conn_t *connp) { ipha_t *ipha; mblk_t *mp; @@ -21779,7 +22005,7 @@ ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, - (dont_use ? 0 : frag_flag), zoneid, ipst); + (dont_use ? 
0 : frag_flag), zoneid, ipst, connp); } /* @@ -22502,9 +22728,9 @@ another:; queue_t *dev_q = stq->q_next; /* flow controlled */ - if ((dev_q->q_next || dev_q->q_first) && - !canput(dev_q)) + if (DEV_Q_FLOW_BLOCKED(dev_q)) goto blocked; + if ((PROTO == IPPROTO_UDP) && (ip_hdr_included != IP_HDR_INCLUDED)) { hlen = (V_HLEN & 0xF) << 2; @@ -22685,6 +22911,7 @@ another:; ipst->ips_ipv4firewall_physical_out, NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + if (mp == NULL) goto release_ire_and_ill; @@ -22703,7 +22930,9 @@ another:; } mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); - pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE); + + pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp); + if ((pktxmit_state == SEND_FAILED) || (pktxmit_state == LLHDR_RESLV_FAILED)) { ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" @@ -22976,10 +23205,9 @@ broadcast: #endif sctph->sh_chksum = sctp_cksum(mp, hlen); } else { - queue_t *dev_q = stq->q_next; + queue_t *dev_q = stq->q_next; - if ((dev_q->q_next || dev_q->q_first) && - !canput(dev_q)) { + if (DEV_Q_FLOW_BLOCKED(dev_q)) { blocked: ipha->ipha_ident = ip_hdr_included; /* @@ -23314,7 +23542,7 @@ checksumoptions: DTRACE_PROBE2(ip__xmit__2, mblk_t *, mp, ire_t *, ire); pktxmit_state = ip_xmit_v4(mp, ire, - NULL, B_TRUE); + NULL, B_TRUE, connp); if ((pktxmit_state == SEND_FAILED) || (pktxmit_state == LLHDR_RESLV_FAILED)) { release_ire_and_ill_2: @@ -23471,13 +23699,14 @@ fragmentit: "ip_wput_ire_end: q %p (%S)", q, "last fragmentation"); ip_wput_ire_fragmentit(mp, ire, - zoneid, ipst); + zoneid, ipst, connp); ire_refrele(ire); if (conn_outgoing_ill != NULL) ill_refrele(conn_outgoing_ill); return; } - ip_wput_ire_fragmentit(mp, ire, zoneid, ipst); + ip_wput_ire_fragmentit(mp, ire, + zoneid, ipst, connp); } } } else { @@ -24195,7 +24424,7 @@ pbuf_panic: */ static void ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, - uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst) + uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp) { int i1; mblk_t *ll_hdr_mp; @@ -24253,7 +24482,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, */ if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { /* If nce_state is ND_INITIAL, trigger ARP query */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); + (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); ip1dbg(("ip_wput_frag: mac address for ire is unresolved" " - dropping packet\n")); BUMP_MIB(mibptr, ipIfStatsOutFragFails); @@ -24622,7 +24851,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - putnext(q, xmit_mp); + ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); UPDATE_MIB(out_ill->ill_ip_mib, @@ -24932,7 +25161,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - putnext(q, xmit_mp); + ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); @@ -26286,7 +26515,8 @@ send: "fragmented accelerated packet!\n")); freemsg(ipsec_mp); } else { - ip_wput_ire_fragmentit(ipsec_mp, ire, zoneid, ipst); + ip_wput_ire_fragmentit(ipsec_mp, ire, + zoneid, ipst, NULL); } if (ire_need_rele) ire_refrele(ire); @@ -26461,7 
+26691,7 @@ send: * Call ip_xmit_v4() to trigger ARP query * in case the nce_state is ND_INITIAL */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); + (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); goto drop_pkt; } @@ -26477,7 +26707,7 @@ send: ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); pktxmit_state = ip_xmit_v4(mp, ire, - (io->ipsec_out_accelerated ? io : NULL), B_FALSE); + (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL); if ((pktxmit_state == SEND_FAILED) || (pktxmit_state == LLHDR_RESLV_FAILED)) { @@ -27588,9 +27818,9 @@ nak: */ ASSERT(ipsq != NULL); CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, ip_resume_tcp_bind, connp, - SQTAG_BIND_RETRY); + SQ_FILL, SQTAG_BIND_RETRY); } else if (IPCL_IS_UDP(connp)) { /* * In the case of UDP endpoint we @@ -28053,7 +28283,7 @@ nak: /* * send out queued packets. */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); + (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); IRE_REFRELE(ire); return; @@ -28558,6 +28788,25 @@ ip_wsrv(queue_t *q) } /* + * Callback to disable flow control in IP. + * + * This is a mac client callback added when the DLD_CAPAB_DIRECT capability + * is enabled. + * + * When MAC_TX() is not able to send any more packets, dld sets its queue + * to QFULL and enable the STREAMS flow control. Later, when the underlying + * driver is able to continue to send packets, it calls mac_tx_(ring_)update() + * function and wakes up corresponding mac worker threads, which in turn + * calls this callback function, and disables flow control. + */ +/* ARGSUSED */ +void +ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie) +{ + qenable(((ill_t *)ill)->ill_wq); +} + +/* * Walk the list of all conn's calling the function provided with the * specified argument for each. Note that this only walks conn's that * have been bound. @@ -29280,17 +29529,17 @@ ip_cgtp_filter_is_registered(netstackid_t stackid) return (ret); } -static squeue_func_t +static int ip_squeue_switch(int val) { - squeue_func_t rval = squeue_fill; + int rval = SQ_FILL; switch (val) { case IP_SQUEUE_ENTER_NODRAIN: - rval = squeue_enter_nodrain; + rval = SQ_NODRAIN; break; case IP_SQUEUE_ENTER: - rval = squeue_enter; + rval = SQ_PROCESS; break; default: break; @@ -29312,7 +29561,7 @@ ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, if (ddi_strtol(value, NULL, 10, &new_value) != 0) return (EINVAL); - ip_input_proc = ip_squeue_switch(new_value); + ip_squeue_flag = ip_squeue_switch(new_value); *v = new_value; return (0); } @@ -29983,7 +30232,8 @@ ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, * ip_wput_frag can call this function. 
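(The conn_t argument being added below is threaded straight through to ILL_SEND_TX() at the transmit site; callers with no sending conn, such as the ARP-trigger and IPsec call sites in this file, pass NULL.)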
*/ ipxmit_state_t -ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled) +ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, + boolean_t flow_ctl_enabled, conn_t *connp) { nce_t *arpce; ipha_t *ipha; @@ -30069,7 +30319,8 @@ ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled) ipha_t *, ipha, ip6_t *, NULL, int, 0); - putnext(q, first_mp); + ILL_SEND_TX(out_ill, + ire, connp, first_mp, 0); } else { BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutDiscards); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 810cec9e8a..a1d97627b2 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -98,6 +98,7 @@ #include <inet/udp_impl.h> #include <inet/rawip_impl.h> #include <inet/rts_impl.h> +#include <sys/squeue_impl.h> #include <sys/squeue.h> #include <sys/tsol/label.h> @@ -108,7 +109,7 @@ /* Temporary; for CR 6451644 work-around */ #include <sys/ethernet.h> -extern squeue_func_t ip_input_proc; +extern int ip_squeue_flag; /* * Naming conventions: @@ -887,8 +888,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, goto drop_pkt; } - squeue_fill(connp->conn_sqp, first_mp, tcp_input, - connp, SQTAG_TCP6_INPUT_ICMP_ERR); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, + SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR); return; } @@ -2538,8 +2539,9 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp, if (mp != NULL) { if (IPCL_IS_TCP(connp)) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind, - connp, SQTAG_TCP_RPUTOTHER); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + ip_resume_tcp_bind, connp, SQ_FILL, + SQTAG_TCP_RPUTOTHER); } else if (IPCL_IS_UDP(connp)) { udp_resume_bind(connp, mp); } else { @@ -3637,8 +3639,8 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill, BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); if (IPCL_IS_TCP(connp)) { - (*ip_input_proc)(connp->conn_sqp, first_mp, - connp->conn_recv, connp, SQTAG_IP6_TCP_INPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, + connp, ip_squeue_flag, SQTAG_IP6_TCP_INPUT); } else { /* SOCK_RAW, IPPROTO_TCP case */ (connp->conn_recv)(connp, first_mp, NULL); @@ -11072,7 +11074,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, /* Driver is flow-controlling? */ if (!IP_FLOW_CONTROLLED_ULP(nexthdr) && - ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) { + DEV_Q_FLOW_BLOCKED(dev_q)) { /* * Queue packet if we have an conn to give back * pressure. We can't queue packets intended for @@ -12140,8 +12142,9 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, "connp %p (ENOMEM)\n", (void *)connp)); } else { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mdimp, tcp_input, - connp, SQTAG_TCP_INPUT_MCTL); + SQUEUE_ENTER_ONE(connp->conn_sqp, mdimp, + tcp_input, connp, SQ_FILL, + SQTAG_TCP_INPUT_MCTL); } } @@ -12576,34 +12579,8 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, } } else { /* - * Queue packet if we have an conn to give back pressure. - * We can't queue packets intended for hardware acceleration - * since we've tossed that state already. If the packet is - * being fed back from ire_send_v6, we don't know the - * position in the queue to enqueue the packet and we discard - * the packet. 
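(The putbq()-based backpressure machinery is deleted outright here: as the replacement comment below says, the packet is now simply discarded and counted in ipIfStatsOutDiscards, presumably because backpressure is handled at the MAC layer now, cf. the new ill_flow_enable() callback earlier in this diff.)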
- */ - if (ipst->ips_ip_output_queue && (connp != NULL) && - (io == NULL) && (caller != IRE_SEND)) { - if (caller == IP_WSRV) { - connp->conn_did_putbq = 1; - (void) putbq(connp->conn_wq, mp); - conn_drain_insert(connp); - /* - * caller == IP_WSRV implies we are - * the service thread, and the - * queue is already noenabled. - * The check for canput and - * the putbq is not atomic. - * So we need to check again. - */ - if (canput(stq->q_next)) - connp->conn_did_putbq = 0; - } else { - (void) putq(connp->conn_wq, mp); - } - return; - } + * Can't apply backpressure, just discard the packet. + */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); freemsg(mp); return; diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index c87267cb29..4fa3c7a74d 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -101,6 +101,8 @@ static ire_t *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *, static void ire_del_host_redir(ire_t *, char *); static boolean_t ire_find_best_route(struct radix_node *, void *); static int ip_send_align_hcksum_flags(mblk_t *, ill_t *); +static ire_t *ire_ftable_lookup_simple(ipaddr_t, + ire_t **, zoneid_t, int, ip_stack_t *); /* * Lookup a route in forwarding table. A specific lookup is indicated by @@ -406,6 +408,157 @@ found_ire_held: return (ire); } +/* + * This function is called by + * ip_fast_forward->ire_forward_simple + * The optimizations of this function over ire_ftable_lookup are: + * o removing unnecessary flag matching + * o doing longest prefix match instead of overloading it further + * with the unnecessary "best_prefix_match" + * o Does not do round robin of default route for every packet + * o inlines code of ire_ctable_lookup to look for nexthop cache + * entry before calling ire_route_lookup + */ +static ire_t * +ire_ftable_lookup_simple(ipaddr_t addr, + ire_t **pire, zoneid_t zoneid, int flags, + ip_stack_t *ipst) +{ + ire_t *ire = NULL; + ire_t *tmp_ire = NULL; + struct rt_sockaddr rdst; + struct rt_entry *rt; + irb_t *irb_ptr; + ire_t *save_ire; + int match_flags; + + rdst.rt_sin_len = sizeof (rdst); + rdst.rt_sin_family = AF_INET; + rdst.rt_sin_addr.s_addr = addr; + + /* + * This is basically inlining a simpler version of ire_match_args + */ + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); + + rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, + ipst->ips_ip_ftable, NULL, NULL); + + if (rt == NULL) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (NULL); + } + irb_ptr = &rt->rt_irb; + if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (NULL); + } + + rw_enter(&irb_ptr->irb_lock, RW_READER); + for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { + if (ire->ire_zoneid == zoneid) + break; + } + + if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) { + rw_exit(&irb_ptr->irb_lock); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (NULL); + } + /* we have a ire that matches */ + if (ire != NULL) + IRE_REFHOLD(ire); + rw_exit(&irb_ptr->irb_lock); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + + if ((flags & MATCH_IRE_RJ_BHOLE) && + (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { + return (ire); + } + /* + * At this point, IRE that was found must be an IRE_FORWARDTABLE + * type. If this is a recursive lookup and an IRE_INTERFACE type was + * found, return that. 
If it was some other IRE_FORWARDTABLE type of + * IRE (one of the prefix types), then it is necessary to fill in the + * parent IRE pointed to by pire, and then lookup the gateway address of + * the parent. For backwards compatiblity, if this lookup returns an + * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level + * of lookup is done. + */ + match_flags = MATCH_IRE_DSTONLY; + + if (ire->ire_type & IRE_INTERFACE) + return (ire); + *pire = ire; + /* + * If we can't find an IRE_INTERFACE or the caller has not + * asked for pire, we need to REFRELE the save_ire. + */ + save_ire = ire; + + /* + * Currently MATCH_IRE_ILL is never used with + * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while + * sending out packets as MATCH_IRE_ILL is used only + * for communicating with on-link hosts. We can't assert + * that here as RTM_GET calls this function with + * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. + * We have already used the MATCH_IRE_ILL in determining + * the right prefix route at this point. To match the + * behavior of how we locate routes while sending out + * packets, we don't want to use MATCH_IRE_ILL below + * while locating the interface route. + * + * ire_ftable_lookup may end up with an incomplete IRE_CACHE + * entry for the gateway (i.e., one for which the + * ire_nce->nce_state is not yet ND_REACHABLE). If the caller + * has specified MATCH_IRE_COMPLETE, such entries will not + * be returned; instead, we return the IF_RESOLVER ire. + */ + + if (ire->ire_ipif == NULL) { + tmp_ire = ire; + /* + * Look to see if the nexthop entry is in the + * cachetable (I am inlining a simpler ire_cache_lookup + * here). + */ + ire = ire_cache_lookup_simple(ire->ire_gateway_addr, ipst); + if (ire == NULL) { + /* Try ire_route_lookup */ + ire = tmp_ire; + } else { + goto solved; + } + } + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL_GROUP; + + ire = ire_route_lookup(ire->ire_gateway_addr, 0, + 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); +solved: + DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, + (ire_t *), save_ire); + if (ire == NULL) { + /* + * Do not release the parent ire if MATCH_IRE_PARENT + * is set. Also return it via ire. + */ + ire_refrele(save_ire); + *pire = NULL; + return (ire); + } + if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { + /* + * If the caller did not ask for pire, release + * it now. + */ + if (pire == NULL) { + ire_refrele(save_ire); + } + } + return (ire); +} /* * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group' @@ -1085,6 +1238,246 @@ icmp_err_ret: ire_refrele(ire); } return (NULL); +} + +/* + * Since caller is ip_fast_forward, there is no CGTP or Tsol test + * Also we dont call ftable lookup with MATCH_IRE_PARENT + */ + +ire_t * +ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, + ip_stack_t *ipst) +{ + ipaddr_t gw = 0; + ire_t *ire = NULL; + ire_t *sire = NULL, *save_ire; + ill_t *dst_ill = NULL; + int error; + zoneid_t zoneid; + ipif_t *src_ipif = NULL; + mblk_t *res_mp; + ushort_t ire_marks = 0; + + zoneid = GLOBAL_ZONEID; + + + ire = ire_ftable_lookup_simple(dst, &sire, zoneid, + MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | + MATCH_IRE_RJ_BHOLE, ipst); + + if (ire == NULL) { + ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); + goto icmp_err_ret; + } + + /* + * Verify that the returned IRE does not have either + * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is + * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 
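(Every failure path from here on jumps to icmp_err_ret at the bottom of the function, which sets *ret_action to Forward_ret_icmp_err, or to Forward_blackhole for RTF_BLACKHOLE routes, and returns NULL; generating the actual ICMP error is left to the caller, ip_fast_forward().)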
+ */ + if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) { + ASSERT(ire->ire_type & (IRE_CACHE | IRE_INTERFACE)); + ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n", + (void *)ire)); + goto icmp_err_ret; + } + + /* + * If we already have a fully resolved IRE CACHE of the + * nexthop router, just hand over the cache entry + * and we are done. + */ + + if (ire->ire_type & IRE_CACHE) { + + /* + * If we are using this ire cache entry as a + * gateway to forward packets, chances are we + * will be using it again. So turn off + * the temporary flag, thus reducing its + * chances of getting deleted frequently. + */ + if (ire->ire_marks & IRE_MARK_TEMPORARY) { + irb_t *irb = ire->ire_bucket; + rw_enter(&irb->irb_lock, RW_WRITER); + ire->ire_marks &= ~IRE_MARK_TEMPORARY; + irb->irb_tmp_ire_cnt--; + rw_exit(&irb->irb_lock); + } + + if (sire != NULL) { + UPDATE_OB_PKT_COUNT(sire); + ire_refrele(sire); + } + *ret_action = Forward_ok; + return (ire); + } + /* + * Increment the ire_ob_pkt_count field for ire if it is an + * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and + * increment the same for the parent IRE, sire, if it is some + * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST). + */ + if ((ire->ire_type & IRE_INTERFACE) != 0) { + UPDATE_OB_PKT_COUNT(ire); + ire->ire_last_used_time = lbolt; + } + + /* + * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type + */ + if (sire != NULL) { + gw = sire->ire_gateway_addr; + ASSERT((sire->ire_type & + (IRE_CACHETABLE | IRE_INTERFACE)) == 0); + UPDATE_OB_PKT_COUNT(sire); + } + + /* Obtain dst_ill */ + dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + if (dst_ill == NULL) { + ip2dbg(("ire_forward no dst ill; ire 0x%p\n", + (void *)ire)); + goto icmp_err_ret; + } + + ASSERT(src_ipif == NULL); + /* Now obtain the src_ipif */ + src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, + zoneid, &ire_marks); + if (src_ipif == NULL) + goto icmp_err_ret; + + switch (ire->ire_type) { + case IRE_IF_NORESOLVER: + /* create ire_cache for ire_addr endpoint */ + case IRE_IF_RESOLVER: + /* + * We have the IRE_IF_RESOLVER of the nexthop gateway + * and now need to build a IRE_CACHE for it. + * In this case, we have the following : + * + * 1) src_ipif - used for getting a source address. + * + * 2) dst_ill - from which we derive ire_stq/ire_rfq. This + * means packets using the IRE_CACHE that we will build + * here will go out on dst_ill. + * + * 3) sire may or may not be NULL. But, the IRE_CACHE that is + * to be created will only be tied to the IRE_INTERFACE + * that was derived from the ire_ihandle field. + * + * If sire is non-NULL, it means the destination is + * off-link and we will first create the IRE_CACHE for the + * gateway. + */ + res_mp = dst_ill->ill_resolver_mp; + if (ire->ire_type == IRE_IF_RESOLVER && + (!OK_RESOLVER_MP(res_mp))) { + ire_refrele(ire); + ire = NULL; + goto out; + } + /* + * To be at this point in the code with a non-zero gw + * means that dst is reachable through a gateway that + * we have never resolved. By changing dst to the gw + * addr we resolve the gateway first. + */ + if (gw != INADDR_ANY) { + /* + * The source ipif that was determined above was + * relative to the destination address, not the + * gateway's. If src_ipif was not taken out of + * the IRE_IF_RESOLVER entry, we'll need to call + * ipif_select_source() again. 
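(That is, re-select the source address relative to gw rather than dst, then resolve the gateway itself by substituting it for dst, which is exactly what the dst = gw; gw = INADDR_ANY; lines below do.)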
+ */ + if (src_ipif != ire->ire_ipif) { + ipif_refrele(src_ipif); + src_ipif = ipif_select_source(dst_ill, + gw, zoneid); + if (src_ipif == NULL) + goto icmp_err_ret; + } + dst = gw; + gw = INADDR_ANY; + } + + if (ire->ire_type == IRE_IF_NORESOLVER) + dst = ire->ire_addr; /* ire_cache for tunnel endpoint */ + + save_ire = ire; + /* + * create an incomplete IRE_CACHE. + * An areq_mp will be generated in ire_arpresolve() for + * RESOLVER interfaces. + */ + ire = ire_create( + (uchar_t *)&dst, /* dest address */ + (uchar_t *)&ip_g_all_ones, /* mask */ + (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ + (uchar_t *)&gw, /* gateway address */ + (save_ire->ire_type == IRE_IF_RESOLVER ? NULL: + &save_ire->ire_max_frag), + NULL, + dst_ill->ill_rq, /* recv-from queue */ + dst_ill->ill_wq, /* send-to queue */ + IRE_CACHE, /* IRE type */ + src_ipif, + ire->ire_mask, /* Parent mask */ + 0, + ire->ire_ihandle, /* Interface handle */ + 0, + &(ire->ire_uinfo), + NULL, + NULL, + ipst); + ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire)); + if (ire != NULL) { + ire->ire_marks |= ire_marks; + /* add the incomplete ire: */ + error = ire_add(&ire, NULL, NULL, NULL, B_TRUE); + if (error == 0 && ire != NULL) { + ire->ire_max_frag = save_ire->ire_max_frag; + ip1dbg(("setting max_frag to %d in ire 0x%p\n", + ire->ire_max_frag, (void *)ire)); + } else { + ire_refrele(save_ire); + goto icmp_err_ret; + } + } + + ire_refrele(save_ire); + break; + default: + break; + } + +out: + *ret_action = Forward_ok; + if (sire != NULL) + ire_refrele(sire); + if (dst_ill != NULL) + ill_refrele(dst_ill); + if (src_ipif != NULL) + ipif_refrele(src_ipif); + return (ire); +icmp_err_ret: + *ret_action = Forward_ret_icmp_err; + if (src_ipif != NULL) + ipif_refrele(src_ipif); + if (dst_ill != NULL) + ill_refrele(dst_ill); + if (sire != NULL) + ire_refrele(sire); + if (ire != NULL) { + if (ire->ire_flags & RTF_BLACKHOLE) + *ret_action = Forward_blackhole; + ire_refrele(ire); + } + /* caller needs to send icmp error message */ + return (NULL); } @@ -1439,7 +1832,7 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, * if necessary and send it once ready. 
*/ - value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE); + value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE, NULL); cleanup: ire_refrele(ire_cache); /* diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 3b8ff6b5d9..d767b25a76 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -44,6 +44,8 @@ #include <sys/sunldi.h> #include <sys/file.h> #include <sys/bitmap.h> +#include <sys/cpuvar.h> +#include <sys/time.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/param.h> @@ -62,6 +64,7 @@ #include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> +#include <sys/callb.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <inet/mi.h> @@ -94,7 +97,8 @@ #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> -#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/dld.h> #include <sys/systeminfo.h> #include <sys/bootconf.h> @@ -224,25 +228,27 @@ static void ill_ipsec_capab_free(ill_ipsec_capab_t *); static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); static void ill_ipsec_capab_delete(ill_t *, uint_t); static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); -static void ill_capability_proto(ill_t *, int, mblk_t *); static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, boolean_t); static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_mdt_reset(ill_t *, mblk_t **); +static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *); static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_ipsec_reset(ill_t *, mblk_t **); +static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *); static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_hcksum_reset(ill_t *, mblk_t **); +static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); -static void ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_lso_reset(ill_t *, mblk_t **); -static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); -static void ill_capability_dls_reset(ill_t *, mblk_t **); -static void ill_capability_dls_disable(ill_t *); +static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); +static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *, + int *); +static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); +static void ill_capability_dld_ack(ill_t *, mblk_t *, + dl_capability_sub_t *); +static void ill_capability_dld_enable(ill_t *); +static void ill_capability_ack_thr(void *); +static void ill_capability_lso_enable(ill_t *); +static void ill_capability_send(ill_t *, mblk_t *); static void illgrp_cache_delete(ire_t *, char *); static void illgrp_delete(ill_t *ill); @@ -523,16 +529,6 @@ static ipif_t ipif_zero; */ uint_t ill_no_arena = 12; /* Setable in /etc/system */ -/* - * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout - * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is - * set through platform specific code (Niagara/Ontario). 
- */ -#define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ - (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) - -#define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) - static uint_t ipif_rand(ip_stack_t *ipst) { @@ -824,12 +820,8 @@ ill_delete_tail(ill_t *ill) while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) cv_wait(&ill->ill_cv, &ill->ill_lock); mutex_exit(&ill->ill_lock); - - /* - * Clean up polling and soft ring capabilities - */ - if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) - ill_capability_dls_disable(ill); + ASSERT(!(ill->ill_capabilities & + (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); if (ill->ill_net_type != IRE_LOOPBACK) qprocsoff(ill->ill_rq); @@ -879,17 +871,11 @@ ill_delete_tail(ill_t *ill) ill->ill_lso_capab = NULL; } - if (ill->ill_dls_capab != NULL) { - CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); - ill->ill_dls_capab->ill_unbind_conn = NULL; - kmem_free(ill->ill_dls_capab, - sizeof (ill_dls_capab_t) + - (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); - ill->ill_dls_capab = NULL; + if (ill->ill_dld_capab != NULL) { + kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); + ill->ill_dld_capab = NULL; } - ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); - while (ill->ill_ipif != NULL) ipif_free_tail(ill->ill_ipif); @@ -1478,7 +1464,7 @@ conn_ioctl_cleanup(conn_t *connp) refheld = ill_waiter_inc(ill); mutex_exit(&connp->conn_lock); if (refheld) { - if (ipsq_enter(ill, B_TRUE)) { + if (ipsq_enter(ill, B_TRUE, NEW_OP)) { ill_waiter_dcr(ill); /* * Check whether this ioctl has started and is @@ -1742,104 +1728,114 @@ ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) void ill_capability_probe(ill_t *ill) { + mblk_t *mp; + + ASSERT(IAM_WRITER_ILL(ill)); + + if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && + ill->ill_dlpi_capab_state != IDCS_FAILED) + return; + /* - * Do so only if capabilities are still unknown. + * We are starting a new cycle of capability negotiation. + * Free up the capab reset messages of any previous incarnation. + * We will do a fresh allocation when we get the response to our probe */ - if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) - return; + if (ill->ill_capab_reset_mp != NULL) { + freemsg(ill->ill_capab_reset_mp); + ill->ill_capab_reset_mp = NULL; + } - ill->ill_dlpi_capab_state = IDS_INPROGRESS; ip1dbg(("ill_capability_probe: starting capability negotiation\n")); - ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL); + + mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) + return; + + ill_capability_send(ill, mp); + ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } void -ill_capability_reset(ill_t *ill) -{ - mblk_t *sc_mp = NULL; - mblk_t *tmp; - - /* - * Note here that we reset the state to UNKNOWN, and later send - * down the DL_CAPABILITY_REQ without first setting the state to - * INPROGRESS. We do this in order to distinguish the - * DL_CAPABILITY_ACK response which may come back in response to - * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would - * also handle the case where the driver doesn't send us back - * a DL_CAPABILITY_ACK in response, since the "probe" routine - * requires the state to be in UNKNOWN anyway. In any case, all - * features are turned off until the state reaches IDS_OK. - */ - ill->ill_dlpi_capab_state = IDS_UNKNOWN; - ill->ill_capab_reneg = B_FALSE; - - /* - * Disable sub-capabilities and request a list of sub-capability - * messages which will be sent down to the driver. 
Each handler - * allocates the corresponding dl_capability_sub_t inside an - * mblk, and links it to the existing sc_mp mblk, or return it - * as sc_mp if it's the first sub-capability (the passed in - * sc_mp is NULL). Upon returning from all capability handlers, - * sc_mp will be pulled-up, before passing it downstream. - */ - ill_capability_mdt_reset(ill, &sc_mp); - ill_capability_hcksum_reset(ill, &sc_mp); - ill_capability_zerocopy_reset(ill, &sc_mp); - ill_capability_ipsec_reset(ill, &sc_mp); - ill_capability_dls_reset(ill, &sc_mp); - ill_capability_lso_reset(ill, &sc_mp); - - /* Nothing to send down in order to disable the capabilities? */ - if (sc_mp == NULL) - return; +ill_capability_reset(ill_t *ill, boolean_t reneg) +{ + ASSERT(IAM_WRITER_ILL(ill)); - tmp = msgpullup(sc_mp, -1); - freemsg(sc_mp); - if ((sc_mp = tmp) == NULL) { - cmn_err(CE_WARN, "ill_capability_reset: unable to send down " - "DL_CAPABILITY_REQ (ENOMEM)\n"); + if (ill->ill_dlpi_capab_state != IDCS_OK) return; - } - ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); - ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); + ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; + + ill_capability_send(ill, ill->ill_capab_reset_mp); + ill->ill_capab_reset_mp = NULL; + /* + * We turn off all capabilities except those pertaining to + * direct function call capabilities viz. ILL_CAPAB_DLD* + * which will be turned off by the corresponding reset functions. + */ + ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | + ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); } -/* - * Request or set new-style hardware capabilities supported by DLS provider. - */ static void -ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) +ill_capability_reset_alloc(ill_t *ill) { mblk_t *mp; - dl_capability_req_t *capb; - size_t size = 0; - uint8_t *ptr; + size_t size = 0; + int err; + dl_capability_req_t *capb; - if (reqp != NULL) - size = MBLKL(reqp); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(ill->ill_capab_reset_mp == NULL); - mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); - if (mp == NULL) { - freemsg(reqp); - return; + if (ILL_MDT_CAPABLE(ill)) + size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); + + if (ILL_HCKSUM_CAPABLE(ill)) { + size += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_hcksum_t); } - ptr = mp->b_rptr; - capb = (dl_capability_req_t *)ptr; - ptr += sizeof (dl_capability_req_t); + if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { + size += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_zerocopy_t); + } - if (reqp != NULL) { - capb->dl_sub_offset = sizeof (dl_capability_req_t); - capb->dl_sub_length = size; - bcopy(reqp->b_rptr, ptr, size); - ptr += size; - mp->b_cont = reqp->b_cont; - freeb(reqp); + if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { + size += sizeof (dl_capability_sub_t); + size += ill_capability_ipsec_reset_size(ill, NULL, NULL, + NULL, NULL); } - ASSERT(ptr == mp->b_wptr); - ill_dlpi_send(ill, mp); + if (ill->ill_capabilities & ILL_CAPAB_DLD) { + size += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_dld_t); + } + + mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, + STR_NOSIG, &err); + + mp->b_datap->db_type = M_PROTO; + bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); + + capb = (dl_capability_req_t *)mp->b_rptr; + capb->dl_primitive = DL_CAPABILITY_REQ; + capb->dl_sub_offset = sizeof (dl_capability_req_t); + capb->dl_sub_length = size; + + mp->b_wptr += sizeof (dl_capability_req_t); + + /* 
+ * Each handler fills in the corresponding dl_capability_sub_t + * inside the mblk, + */ + ill_capability_mdt_reset_fill(ill, mp); + ill_capability_hcksum_reset_fill(ill, mp); + ill_capability_zerocopy_reset_fill(ill, mp); + ill_capability_ipsec_reset_fill(ill, mp); + ill_capability_dld_reset_fill(ill, mp); + + ill->ill_capab_reset_mp = mp; } static void @@ -1944,7 +1940,6 @@ ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) if (*ill_mdt_capab == NULL) { *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), KM_NOSLEEP); - if (*ill_mdt_capab == NULL) { cmn_err(CE_WARN, "ill_capability_mdt_ack: " "could not enable MDT version %d " @@ -2017,42 +2012,22 @@ ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); } } static void -ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) { - mblk_t *mp; dl_capab_mdt_t *mdt_subcap; dl_capability_sub_t *dl_subcap; - int size; if (!ILL_MDT_CAPABLE(ill)) return; ASSERT(ill->ill_mdt_capab != NULL); - /* - * Clear the capability flag for MDT but retain the ill_mdt_capab - * structure since it's possible that another thread is still - * referring to it. The structure only gets deallocated when - * we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_MDT; - - size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_mdt_reset: unable to allocate " - "request to disable MDT\n")); - return; - } - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; dl_subcap->dl_cap = DL_CAPAB_MDT; dl_subcap->dl_length = sizeof (*mdt_subcap); @@ -2062,10 +2037,26 @@ ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) mdt_subcap->mdt_hdr_head = 0; mdt_subcap->mdt_hdr_tail = 0; - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); +} + +static void +ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) +{ + dl_capability_sub_t *dl_subcap; + + if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) + return; + + /* + * The dl_capab_dld_t that follows the dl_capability_sub_t is not + * initialized below since it is not used by DLD. + */ + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; + dl_subcap->dl_cap = DL_CAPAB_DLD; + dl_subcap->dl_length = sizeof (dl_capab_dld_t); + + mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); } /* @@ -2371,7 +2362,7 @@ ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) * nmp points to a DL_CAPABILITY_REQ message to enable * IPsec hardware acceleration. 
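	 * (ill_capability_send() counts each such outstanding request in
	 * ill_capab_pending_cnt, so that ill_capability_done() can tell
	 * when the exchange has finished.)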
*/ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); if (need_sadb_dump) /* @@ -2457,10 +2448,10 @@ ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, } /* ARGSUSED */ -static void -ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) +static int +ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, + int *esp_cntp, int *esp_lenp) { - mblk_t *mp; ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; uint64_t ill_capabilities = ill->ill_capabilities; @@ -2469,7 +2460,7 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) int i, size = 0; if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) - return; + return (0); ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); @@ -2504,18 +2495,32 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) } } - if (size == 0) { - ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " - "there's nothing to reset\n")); - return; - } + if (ah_cntp != NULL) + *ah_cntp = ah_cnt; + if (ah_lenp != NULL) + *ah_lenp = ah_len; + if (esp_cntp != NULL) + *esp_cntp = esp_cnt; + if (esp_lenp != NULL) + *esp_lenp = esp_len; - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_ipsec_reset: unable to allocate " - "request to disable IPSEC Hardware Acceleration\n")); + return (size); +} + +/* ARGSUSED */ +static void +ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) +{ + ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; + ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; + int ah_cnt = 0, esp_cnt = 0; + int ah_len = 0, esp_len = 0; + int size; + + size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, + &esp_cnt, &esp_len); + if (size == 0) return; - } /* * Clear the capability flags for IPsec HA but retain the ill @@ -2527,20 +2532,17 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) * hardware acceleration, and by clearing them we ensure that new * outbound IPsec packets are sent down encrypted. */ - ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ if (ah_cnt > 0) { ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, cap_ah, mp); - ASSERT(mp->b_rptr + size >= mp->b_wptr); } /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ if (esp_cnt > 0) { ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, cap_esp, mp); - ASSERT(mp->b_rptr + size >= mp->b_wptr); } /* @@ -2550,11 +2552,6 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) * must stop inbound decryption (by destroying all inbound SAs) * and let the corresponding packets come in encrypted. */ - - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; } static void @@ -2564,15 +2561,6 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, boolean_t legacy = B_FALSE; /* - * If this DL_CAPABILITY_ACK came in as a response to our "reset" - * DL_CAPABILITY_REQ, ignore it during this cycle. We've just - * instructed the driver to disable its advertised capabilities, - * so there's no point in accepting any response at this moment. 
- */ - if (ill->ill_dlpi_capab_state == IDS_UNKNOWN) - return; - - /* * Note that only the following two sub-capabilities may be * considered as "legacy", since their original definitions * do not incorporate the dl_mid_t module ID token, and hence @@ -2611,16 +2599,8 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, case DL_CAPAB_ZEROCOPY: ill_capability_zerocopy_ack(ill, mp, subp); break; - case DL_CAPAB_POLL: - if (!SOFT_RINGS_ENABLED()) - ill_capability_dls_ack(ill, mp, subp); - break; - case DL_CAPAB_SOFT_RING: - if (SOFT_RINGS_ENABLED()) - ill_capability_dls_ack(ill, mp, subp); - break; - case DL_CAPAB_LSO: - ill_capability_lso_ack(ill, mp, subp); + case DL_CAPAB_DLD: + ill_capability_dld_ack(ill, mp, subp); break; default: ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", @@ -2629,407 +2609,6 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, } /* - * As part of negotiating polling capability, the driver tells us - * the default (or normal) blanking interval and packet threshold - * (the receive timer fires if blanking interval is reached or - * the packet threshold is reached). - * - * As part of manipulating the polling interval, we always use our - * estimated interval (avg service time * number of packets queued - * on the squeue) but we try to blank for a minimum of - * rr_normal_blank_time * rr_max_blank_ratio. We disable the - * packet threshold during this time. When we are not in polling mode - * we set the blank interval typically lower, rr_normal_pkt_cnt * - * rr_min_blank_ratio but up the packet cnt by a ratio of - * rr_min_pkt_cnt_ratio so that we are still getting chains if - * possible although for a shorter interval. - */ -#define RR_MAX_BLANK_RATIO 20 -#define RR_MIN_BLANK_RATIO 10 -#define RR_MAX_PKT_CNT_RATIO 3 -#define RR_MIN_PKT_CNT_RATIO 3 - -/* - * These can be tuned via /etc/system. 
- */ -int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; -int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; -int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; -int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; - -static mac_resource_handle_t -ill_ring_add(void *arg, mac_resource_t *mrp) -{ - ill_t *ill = (ill_t *)arg; - mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; - ill_rx_ring_t *rx_ring; - int ip_rx_index; - - ASSERT(mrp != NULL); - if (mrp->mr_type != MAC_RX_FIFO) { - return (NULL); - } - ASSERT(ill != NULL); - ASSERT(ill->ill_dls_capab != NULL); - - mutex_enter(&ill->ill_lock); - for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { - rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; - ASSERT(rx_ring != NULL); - - if (rx_ring->rr_ring_state == ILL_RING_FREE) { - time_t normal_blank_time = - mrfp->mrf_normal_blank_time; - uint_t normal_pkt_cnt = - mrfp->mrf_normal_pkt_count; - - bzero(rx_ring, sizeof (ill_rx_ring_t)); - - rx_ring->rr_blank = mrfp->mrf_blank; - rx_ring->rr_handle = mrfp->mrf_arg; - rx_ring->rr_ill = ill; - rx_ring->rr_normal_blank_time = normal_blank_time; - rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; - - rx_ring->rr_max_blank_time = - normal_blank_time * rr_max_blank_ratio; - rx_ring->rr_min_blank_time = - normal_blank_time * rr_min_blank_ratio; - rx_ring->rr_max_pkt_cnt = - normal_pkt_cnt * rr_max_pkt_cnt_ratio; - rx_ring->rr_min_pkt_cnt = - normal_pkt_cnt * rr_min_pkt_cnt_ratio; - - rx_ring->rr_ring_state = ILL_RING_INUSE; - mutex_exit(&ill->ill_lock); - - DTRACE_PROBE2(ill__ring__add, (void *), ill, - (int), ip_rx_index); - return ((mac_resource_handle_t)rx_ring); - } - } - - /* - * We ran out of ILL_MAX_RINGS worth rx_ring structures. If - * we have devices which can overwhelm this limit, ILL_MAX_RING - * should be made configurable. Meanwhile it cause no panic because - * driver will pass ip_input a NULL handle which will make - * IP allocate the default squeue and Polling mode will not - * be used for this ring. 
- */ - cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " - "for %s\n", ILL_MAX_RINGS, ill->ill_name); - - mutex_exit(&ill->ill_lock); - return (NULL); -} - -static boolean_t -ill_capability_dls_init(ill_t *ill) -{ - ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - conn_t *connp; - size_t sz; - ip_stack_t *ipst = ill->ill_ipst; - - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { - if (ill_dls == NULL) { - cmn_err(CE_PANIC, "ill_capability_dls_init: " - "soft_ring enabled for ill=%s (%p) but data " - "structs uninitialized\n", ill->ill_name, - (void *)ill); - } - return (B_TRUE); - } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { - if (ill_dls == NULL) { - cmn_err(CE_PANIC, "ill_capability_dls_init: " - "polling enabled for ill=%s (%p) but data " - "structs uninitialized\n", ill->ill_name, - (void *)ill); - } - return (B_TRUE); - } - - if (ill_dls != NULL) { - ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; - /* Soft_Ring or polling is being re-enabled */ - - connp = ill_dls->ill_unbind_conn; - ASSERT(rx_ring != NULL); - bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); - bzero((void *)rx_ring, - sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); - ill_dls->ill_ring_tbl = rx_ring; - ill_dls->ill_unbind_conn = connp; - return (B_TRUE); - } - - if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, - ipst->ips_netstack)) == NULL) - return (B_FALSE); - - sz = sizeof (ill_dls_capab_t); - sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; - - ill_dls = kmem_zalloc(sz, KM_NOSLEEP); - if (ill_dls == NULL) { - cmn_err(CE_WARN, "ill_capability_dls_init: could not " - "allocate dls_capab for %s (%p)\n", ill->ill_name, - (void *)ill); - CONN_DEC_REF(connp); - return (B_FALSE); - } - - /* Allocate space to hold ring table */ - ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; - ill->ill_dls_capab = ill_dls; - ill_dls->ill_unbind_conn = connp; - return (B_TRUE); -} - -/* - * ill_capability_dls_disable: disable soft_ring and/or polling - * capability. Since any of the rings might already be in use, need - * to call ip_squeue_clean_all() which gets behind the squeue to disable - * direct calls if necessary. 
- */ -static void -ill_capability_dls_disable(ill_t *ill) -{ - ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - - if (ill->ill_capabilities & ILL_CAPAB_DLS) { - ip_squeue_clean_all(ill); - ill_dls->ill_tx = NULL; - ill_dls->ill_tx_handle = NULL; - ill_dls->ill_dls_change_status = NULL; - ill_dls->ill_dls_bind = NULL; - ill_dls->ill_dls_unbind = NULL; - } - - ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); -} - -static void -ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, - dl_capability_sub_t *isub) -{ - uint_t size; - uchar_t *rptr; - dl_capab_dls_t dls, *odls; - ill_dls_capab_t *ill_dls; - mblk_t *nmp = NULL; - dl_capability_req_t *ocap; - uint_t sub_dl_cap = isub->dl_cap; - - if (!ill_capability_dls_init(ill)) - return; - ill_dls = ill->ill_dls_capab; - - /* Copy locally to get the members aligned */ - bcopy((void *)idls, (void *)&dls, - sizeof (dl_capab_dls_t)); - - /* Get the tx function and handle from dld */ - ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; - ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; - - if (sub_dl_cap == DL_CAPAB_SOFT_RING) { - ill_dls->ill_dls_change_status = - (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; - ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; - ill_dls->ill_dls_unbind = - (ip_dls_unbind_t)dls.dls_ring_unbind; - ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; - } - - size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + - isub->dl_length; - - if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_dls_capable: could " - "not allocate memory for CAPAB_REQ for %s (%p)\n", - ill->ill_name, (void *)ill); - return; - } - - /* initialize dl_capability_req_t */ - rptr = nmp->b_rptr; - ocap = (dl_capability_req_t *)rptr; - ocap->dl_sub_offset = sizeof (dl_capability_req_t); - ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; - rptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, rptr, sizeof (*isub)); - rptr += sizeof (*isub); - - odls = (dl_capab_dls_t *)rptr; - rptr += sizeof (dl_capab_dls_t); - - /* initialize dl_capab_dls_t to be sent down */ - dls.dls_rx_handle = (uintptr_t)ill; - dls.dls_rx = (uintptr_t)ip_input; - dls.dls_ring_add = (uintptr_t)ill_ring_add; - - if (sub_dl_cap == DL_CAPAB_SOFT_RING) { - dls.dls_ring_cnt = ip_soft_rings_cnt; - dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; - dls.dls_flags = SOFT_RING_ENABLE; - } else { - dls.dls_flags = POLL_ENABLE; - ip1dbg(("ill_capability_dls_capable: asking interface %s " - "to enable polling\n", ill->ill_name)); - } - bcopy((void *)&dls, (void *)odls, - sizeof (dl_capab_dls_t)); - ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); - /* - * nmp points to a DL_CAPABILITY_REQ message to - * enable either soft_ring or polling - */ - ill_dlpi_send(ill, nmp); -} - -static void -ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) -{ - mblk_t *mp; - dl_capab_dls_t *idls; - dl_capability_sub_t *dl_subcap; - int size; - - if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) - return; - - ASSERT(ill->ill_dls_capab != NULL); - - size = sizeof (*dl_subcap) + sizeof (*idls); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_dls_reset: unable to allocate " - "request to disable soft_ring\n")); - return; - } - - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; - dl_subcap->dl_length = sizeof (*idls); - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) - dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; - else - 
dl_subcap->dl_cap = DL_CAPAB_POLL; - - idls = (dl_capab_dls_t *)(dl_subcap + 1); - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) - idls->dls_flags = SOFT_RING_DISABLE; - else - idls->dls_flags = POLL_DISABLE; - - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; -} - -/* - * Process a soft_ring/poll capability negotiation ack received - * from a DLS Provider.isub must point to the sub-capability - * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. - */ -static void -ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) -{ - dl_capab_dls_t *idls; - uint_t sub_dl_cap = isub->dl_cap; - uint8_t *capend; - - ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || - sub_dl_cap == DL_CAPAB_POLL); - - if (ill->ill_isv6) - return; - - /* - * Note: range checks here are not absolutely sufficient to - * make us robust against malformed messages sent by drivers; - * this is in keeping with the rest of IP's dlpi handling. - * (Remember, it's coming from something else in the kernel - * address space) - */ - capend = (uint8_t *)(isub + 1) + isub->dl_length; - if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_dls_ack: " - "malformed sub-capability too long for mblk"); - return; - } - - /* - * There are two types of acks we process here: - * 1. acks in reply to a (first form) generic capability req - * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) - * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE - * capability req. - */ - idls = (dl_capab_dls_t *)(isub + 1); - - if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_dls_ack: mid token for dls " - "capability isn't as expected; pass-thru " - "module(s) detected, discarding capability\n")); - if (ill->ill_capabilities & ILL_CAPAB_DLS) { - /* - * This is a capability renegotitation case. - * The interface better be unusable at this - * point other wise bad things will happen - * if we disable direct calls on a running - * and up interface. - */ - ill_capability_dls_disable(ill); - } - return; - } - - switch (idls->dls_flags) { - default: - /* Disable if unknown flag */ - case SOFT_RING_DISABLE: - case POLL_DISABLE: - ill_capability_dls_disable(ill); - break; - case SOFT_RING_CAPABLE: - case POLL_CAPABLE: - /* - * If the capability was already enabled, its safe - * to disable it first to get rid of stale information - * and then start enabling it again. - */ - ill_capability_dls_disable(ill); - ill_capability_dls_capable(ill, idls, isub); - break; - case SOFT_RING_ENABLE: - case POLL_ENABLE: - mutex_enter(&ill->ill_lock); - if (sub_dl_cap == DL_CAPAB_SOFT_RING && - !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { - ASSERT(ill->ill_dls_capab != NULL); - ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; - } - if (sub_dl_cap == DL_CAPAB_POLL && - !(ill->ill_capabilities & ILL_CAPAB_POLL)) { - ASSERT(ill->ill_dls_capab != NULL); - ill->ill_capabilities |= ILL_CAPAB_POLL; - ip1dbg(("ill_capability_dls_ack: interface %s " - "has enabled polling\n", ill->ill_name)); - } - mutex_exit(&ill->ill_lock); - break; - } -} - -/* * Process a hardware checksum offload capability negotiation ack received * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) * of a DL_CAPABILITY_ACK message. @@ -3164,7 +2743,7 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) * nmp points to a DL_CAPABILITY_REQ message to enable * hardware checksum acceleration. 
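	 * (As with the other sub-capabilities, the driver's response to
	 * this enable request arrives as another DL_CAPABILITY_ACK and is
	 * routed back through ill_capability_dispatch().)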
*/ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); } else { ip1dbg(("ill_capability_hcksum_ack: interface %s has " "advertised %x hardware checksum capability flags\n", @@ -3173,37 +2752,17 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) } static void -ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) { - mblk_t *mp; dl_capab_hcksum_t *hck_subcap; dl_capability_sub_t *dl_subcap; - int size; if (!ILL_HCKSUM_CAPABLE(ill)) return; ASSERT(ill->ill_hcksum_capab != NULL); - /* - * Clear the capability flag for hardware checksum offload but - * retain the ill_hcksum_capab structure since it's possible that - * another thread is still referring to it. The structure only - * gets deallocated when we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; - size = sizeof (*dl_subcap) + sizeof (*hck_subcap); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_hcksum_reset: unable to allocate " - "request to disable hardware checksum offload\n")); - return; - } - - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; dl_subcap->dl_cap = DL_CAPAB_HCKSUM; dl_subcap->dl_length = sizeof (*hck_subcap); @@ -3211,10 +2770,7 @@ ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; hck_subcap->hcksum_txflags = 0; - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); } static void @@ -3325,42 +2881,22 @@ ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); } } static void -ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) { - mblk_t *mp; dl_capab_zerocopy_t *zerocopy_subcap; dl_capability_sub_t *dl_subcap; - int size; if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) return; ASSERT(ill->ill_zerocopy_capab != NULL); - /* - * Clear the capability flag for Zero-copy but retain the - * ill_zerocopy_capab structure since it's possible that another - * thread is still referring to it. The structure only gets - * deallocated when we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; - - size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " - "request to disable Zero-copy\n")); - return; - } - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; dl_subcap->dl_length = sizeof (*zerocopy_subcap); @@ -3369,30 +2905,24 @@ ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) ill->ill_zerocopy_capab->ill_zerocopy_version; zerocopy_subcap->zerocopy_flags = 0; - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); } /* - * Process Large Segment Offload capability negotiation ack received from a - * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a - * DL_CAPABILITY_ACK message. 
+ * DLD capability + * Refer to dld.h for more information regarding the purpose and usage + * of this capability. */ static void -ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) +ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) { - mblk_t *nmp = NULL; - dl_capability_req_t *oc; - dl_capab_lso_t *lso_ic, *lso_oc; - ill_lso_capab_t **ill_lso_capab; - uint_t sub_dl_cap = isub->dl_cap; - uint8_t *capend; - - ASSERT(sub_dl_cap == DL_CAPAB_LSO); + dl_capab_dld_t *dld_ic, dld; + uint_t sub_dl_cap = isub->dl_cap; + uint8_t *capend; + ill_dld_capab_t *idc; - ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab; + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(sub_dl_cap == DL_CAPAB_DLD); /* * Note: range checks here are not absolutely sufficient to @@ -3403,165 +2933,395 @@ ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) */ capend = (uint8_t *)(isub + 1) + isub->dl_length; if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_lso_ack: " + cmn_err(CE_WARN, "ill_capability_dld_ack: " "malformed sub-capability too long for mblk"); return; } - - lso_ic = (dl_capab_lso_t *)(isub + 1); - - if (lso_ic->lso_version != LSO_VERSION_1) { - cmn_err(CE_CONT, "ill_capability_lso_ack: " - "unsupported LSO sub-capability (version %d, expected %d)", - lso_ic->lso_version, LSO_VERSION_1); + dld_ic = (dl_capab_dld_t *)(isub + 1); + if (dld_ic->dld_version != DLD_CURRENT_VERSION) { + cmn_err(CE_CONT, "ill_capability_dld_ack: " + "unsupported DLD sub-capability (version %d, " + "expected %d)", dld_ic->dld_version, + DLD_CURRENT_VERSION); return; } - - if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_lso_ack: mid token for LSO " + if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { + ip1dbg(("ill_capability_dld_ack: mid token for dld " "capability isn't as expected; pass-thru module(s) " "detected, discarding capability\n")); return; } - if ((lso_ic->lso_flags & LSO_TX_ENABLE) && - (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) { - if (*ill_lso_capab == NULL) { - *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), - KM_NOSLEEP); + /* + * Copy locally to ensure alignment. + */ + bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); - if (*ill_lso_capab == NULL) { - cmn_err(CE_WARN, "ill_capability_lso_ack: " - "could not enable LSO version %d " - "for %s (ENOMEM)\n", LSO_VERSION_1, - ill->ill_name); - return; - } + if ((idc = ill->ill_dld_capab) == NULL) { + idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); + if (idc == NULL) { + cmn_err(CE_WARN, "ill_capability_dld_ack: " + "could not enable DLD version %d " + "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, + ill->ill_name); + return; } + idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; + idc->idc_capab_dh = (void *)dld.dld_capab_handle; + ill->ill_dld_capab = idc; + } + ip1dbg(("ill_capability_dld_ack: interface %s " + "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); - (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version; - (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags; - (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max; - ill->ill_capabilities |= ILL_CAPAB_LSO; + ill_capability_dld_enable(ill); +} - ip1dbg(("ill_capability_lso_ack: interface %s " - "has enabled LSO\n ", ill->ill_name)); - } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) { - uint_t size; - uchar_t *rptr; +/* + * Typically capability negotiation between IP and the driver happens via + * DLPI message exchange. 
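+ * (A DL_CAPABILITY_REQ is sent down the stream and the driver answers
+ * with a DL_CAPABILITY_ACK; see ill_capability_probe() and
+ * ill_capability_ack() elsewhere in this file.)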
However GLD also offers a direct function call + * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, + * But arbitrary function calls into IP or GLD are not permitted, since both + * of them are protected by their own perimeter mechanism. The perimeter can + * be viewed as a coarse lock or serialization mechanism. The hierarchy of + * these perimeters is IP -> MAC. Thus for example to enable the squeue + * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter + * to enter the mac perimeter and then do the direct function calls into + * GLD to enable squeue polling. The ring related callbacks from the mac into + * the stack to add, bind, quiesce, restart or cleanup a ring are all + * protected by the mac perimeter. + */ +static void +ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + int err; - size = sizeof (dl_capability_req_t) + - sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t); + err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, + DLD_ENABLE); + ASSERT(err == 0); +} - if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_lso_ack: " +static void +ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + int err; + + err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, + DLD_DISABLE); + ASSERT(err == 0); +} + +boolean_t +ill_mac_perim_held(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + + return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, + DLD_QUERY)); +} + +static void +ill_capability_direct_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_direct_t *idd = &idc->idc_direct; + dld_capab_direct_t direct; + int rc; + + ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + + bzero(&direct, sizeof (direct)); + direct.di_rx_cf = (uintptr_t)ip_input; + direct.di_rx_ch = ill; + + rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, + DLD_ENABLE); + if (rc == 0) { + idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; + idd->idd_tx_dh = direct.di_tx_dh; + idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; + idd->idd_tx_cb_dh = direct.di_tx_cb_dh; + /* + * One time registration of flow enable callback function + */ + ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, + ill_flow_enable, ill); + ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; + DTRACE_PROBE1(direct_on, (ill_t *), ill); + } else { + cmn_err(CE_WARN, "warning: could not enable DIRECT " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); + } +} + +static void +ill_capability_poll_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + dld_capab_poll_t poll; + int rc; + + ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + + bzero(&poll, sizeof (poll)); + poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; + poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; + poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; + poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; + poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; + poll.poll_ring_ch = ill; + rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, + DLD_ENABLE); + if (rc == 0) { + ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; + DTRACE_PROBE1(poll_on, (ill_t *), ill); + } else { + ip1dbg(("warning: could not enable POLL " + "capability, rc = %d\n", rc)); + DTRACE_PROBE2(poll_off, (ill_t *), 
ill, (int), rc); + } +} + +/* + * Enable the LSO capability. + */ +static void +ill_capability_lso_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + dld_capab_lso_t lso; + int rc; + + ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + + if (ill->ill_lso_capab == NULL) { + ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), + KM_NOSLEEP); + if (ill->ill_lso_capab == NULL) { + cmn_err(CE_WARN, "ill_capability_lso_enable: " "could not enable LSO for %s (ENOMEM)\n", ill->ill_name); return; } + } - rptr = nmp->b_rptr; - /* initialize dl_capability_req_t */ - oc = (dl_capability_req_t *)nmp->b_rptr; - oc->dl_sub_offset = sizeof (dl_capability_req_t); - oc->dl_sub_length = sizeof (dl_capability_sub_t) + - sizeof (dl_capab_lso_t); - nmp->b_rptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, nmp->b_rptr, sizeof (*isub)); - nmp->b_rptr += sizeof (*isub); + bzero(&lso, sizeof (lso)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, + DLD_ENABLE)) == 0) { + ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; + ill->ill_lso_capab->ill_lso_max = lso.lso_max; + ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; + ip1dbg(("ill_capability_lso_enable: interface %s " + "has enabled LSO\n ", ill->ill_name)); + } else { + kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); + ill->ill_lso_capab = NULL; + DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); + } +} - /* initialize dl_capab_lso_t */ - lso_oc = (dl_capab_lso_t *)nmp->b_rptr; - bcopy(lso_ic, lso_oc, sizeof (*lso_ic)); +static void +ill_capability_dld_enable(ill_t *ill) +{ + mac_perim_handle_t mph; - nmp->b_rptr = rptr; - ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); + ASSERT(IAM_WRITER_ILL(ill)); - /* set ENABLE flag */ - lso_oc->lso_flags |= LSO_TX_ENABLE; + if (ill->ill_isv6) + return; - /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */ - ill_dlpi_send(ill, nmp); - } else { - ip1dbg(("ill_capability_lso_ack: interface %s has " - "advertised %x LSO capability flags\n", - ill->ill_name, lso_ic->lso_flags)); + ill_mac_perim_enter(ill, &mph); + if (!ill->ill_isv6) { + ill_capability_direct_enable(ill); + ill_capability_poll_enable(ill); + ill_capability_lso_enable(ill); } + ill->ill_capabilities |= ILL_CAPAB_DLD; + ill_mac_perim_exit(ill, mph); } static void -ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_dld_disable(ill_t *ill) { - mblk_t *mp; - dl_capab_lso_t *lso_subcap; - dl_capability_sub_t *dl_subcap; - int size; + ill_dld_capab_t *idc; + ill_dld_direct_t *idd; + mac_perim_handle_t mph; - if (!(ill->ill_capabilities & ILL_CAPAB_LSO)) + ASSERT(IAM_WRITER_ILL(ill)); + + if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) return; - ASSERT(ill->ill_lso_capab != NULL); - /* - * Clear the capability flag for LSO but retain the - * ill_lso_capab structure since it's possible that another - * thread is still referring to it. The structure only gets - * deallocated when we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_LSO; + ill_mac_perim_enter(ill, &mph); + + idc = ill->ill_dld_capab; + if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { + /* + * For performance we avoid locks in the transmit data path + * and don't maintain a count of the number of threads using + * direct calls. Thus some threads could be using direct + * transmit calls to GLD, even after the capability mechanism + * turns it off. This is still safe since the handles used in + * the direct calls continue to be valid until the unplumb is + * completed. 
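+		 * (Hence ILL_CAPAB_DLD_DIRECT is cleared under ill_lock
+		 * below, before GLD is told to disable the capability.)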
Remove the callback that was added (1-time) at + * capab enable time. + */ + mutex_enter(&ill->ill_lock); + ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; + mutex_exit(&ill->ill_lock); + if (ill->ill_flownotify_mh != NULL) { + idd = &idc->idc_direct; + idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, + ill->ill_flownotify_mh); + ill->ill_flownotify_mh = NULL; + } + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + } - size = sizeof (*dl_subcap) + sizeof (*lso_subcap); + if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { + ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; + ip_squeue_clean_all(ill); + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, + NULL, DLD_DISABLE); + } - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_lso_reset: unable to allocate " - "request to disable LSO\n")); - return; + if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { + ASSERT(ill->ill_lso_capab != NULL); + /* + * Clear the capability flag for LSO but retain the + * ill_lso_capab structure since it's possible that another + * thread is still referring to it. The structure only gets + * deallocated when we destroy the ill. + */ + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, + NULL, DLD_DISABLE); } - mp->b_wptr = mp->b_rptr + size; + ill->ill_capabilities &= ~ILL_CAPAB_DLD; + ill_mac_perim_exit(ill, mph); +} - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; - dl_subcap->dl_cap = DL_CAPAB_LSO; - dl_subcap->dl_length = sizeof (*lso_subcap); +/* + * Capability Negotiation protocol + * + * We don't wait for DLPI capability operations to finish during interface + * bringup or teardown. Doing so would introduce more asynchrony and the + * interface up/down operations will need multiple return and restarts. + * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as + * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next + * exclusive operation won't start until the DLPI operations of the previous + * exclusive operation complete. + * + * The capability state machine is shown below. + * + * state next state event, action + * + * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe + * IDCS_PROBE_SENT IDCS_OK ill_capability_ack + * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) + * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG + * IDCS_OK IDCS_RESET_SENT ill_capability_reset + * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr + * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> + * ill_capability_probe. + */ + +/* + * Dedicated thread started from ip_stack_init that handles capability + * disable. This thread ensures the taskq dispatch does not fail by waiting + * for resources using TQ_SLEEP. The taskq mechanism is used to ensure + * that direct calls to DLD are done in a cv_waitable context. 
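+ *
+ * Roughly, an ack reaches ill_capability_ack_thr() one of two ways:
+ *
+ *	ill_capability_ack() -> taskq_dispatch(TQ_NOSLEEP)	usual case
+ *
+ *	ill_capability_ack() -> list_insert_tail(ips_capab_taskq_list),
+ *	cv_signal() -> ill_taskq_dispatch() -> taskq_dispatch(TQ_SLEEP)
+ *	when the TQ_NOSLEEP dispatch fails under memory pressure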
+ */ +void +ill_taskq_dispatch(ip_stack_t *ipst) +{ + callb_cpr_t cprinfo; + char name[64]; + mblk_t *mp; - lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1); - lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version; - lso_subcap->lso_flags = 0; + (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", + ipst->ips_netstack->netstack_stackid); + CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, + name); + mutex_enter(&ipst->ips_capab_taskq_lock); - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + for (;;) { + mp = list_head(&ipst->ips_capab_taskq_list); + while (mp != NULL) { + list_remove(&ipst->ips_capab_taskq_list, mp); + mutex_exit(&ipst->ips_capab_taskq_lock); + VERIFY(taskq_dispatch(system_taskq, + ill_capability_ack_thr, mp, TQ_SLEEP) != 0); + mutex_enter(&ipst->ips_capab_taskq_lock); + mp = list_head(&ipst->ips_capab_taskq_list); + } + + if (ipst->ips_capab_taskq_quit) + break; + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); + CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); + } + VERIFY(list_head(&ipst->ips_capab_taskq_list) == NULL); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); } /* * Consume a new-style hardware capabilities negotiation ack. - * Called from ip_rput_dlpi_writer(). + * Called via taskq on receipt of DL_CAPABBILITY_ACK. */ -void -ill_capability_ack(ill_t *ill, mblk_t *mp) +static void +ill_capability_ack_thr(void *arg) { + mblk_t *mp = arg; dl_capability_ack_t *capp; dl_capability_sub_t *subp, *endp; + ill_t *ill; + boolean_t reneg; - if (ill->ill_dlpi_capab_state == IDS_INPROGRESS) - ill->ill_dlpi_capab_state = IDS_OK; + ill = (ill_t *)mp->b_prev; + VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); + + if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || + ill->ill_dlpi_capab_state == IDCS_RENEG) { + /* + * We have received the ack for our DL_CAPAB reset request. + * There isnt' anything in the message that needs processing. + * All message based capabilities have been disabled, now + * do the function call based capability disable. + */ + reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; + ill_capability_dld_disable(ill); + ill->ill_dlpi_capab_state = IDCS_UNKNOWN; + if (reneg) + ill_capability_probe(ill); + goto done; + } + + if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) + ill->ill_dlpi_capab_state = IDCS_OK; capp = (dl_capability_ack_t *)mp->b_rptr; - if (capp->dl_sub_length == 0) + if (capp->dl_sub_length == 0) { /* no new-style capabilities */ - return; + goto done; + } /* make sure the driver supplied correct dl_sub_length */ if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); - return; + goto done; } + #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) /* * There are sub-capabilities. Process the ones we know about. @@ -3582,6 +3342,34 @@ ill_capability_ack(ill_t *ill, mblk_t *mp) } } #undef SC +done: + inet_freemsg(mp); + ill_capability_done(ill); + ipsq_exit(ill->ill_phyint->phyint_ipsq); +} + +/* + * This needs to be started in a taskq thread to provide a cv_waitable + * context. + */ +void +ill_capability_ack(ill_t *ill, mblk_t *mp) +{ + ip_stack_t *ipst = ill->ill_ipst; + + mp->b_prev = (mblk_t *)ill; + if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, + TQ_NOSLEEP) != 0) + return; + + /* + * The taskq dispatch failed. 
Signal the ill_taskq_dispatch thread + * which will do the dispatch using TQ_SLEEP to guarantee success. + */ + mutex_enter(&ipst->ips_capab_taskq_lock); + list_insert_tail(&ipst->ips_capab_taskq_list, mp); + cv_signal(&ipst->ips_capab_taskq_cv); + mutex_exit(&ipst->ips_capab_taskq_lock); } /* @@ -7609,7 +7397,7 @@ ipsq_dq(ipsq_t *ipsq) */ #define ENTER_SQ_WAIT_TICKS 100 boolean_t -ipsq_enter(ill_t *ill, boolean_t force) +ipsq_enter(ill_t *ill, boolean_t force, int type) { ipsq_t *ipsq; boolean_t waited_enough = B_FALSE; @@ -7630,7 +7418,8 @@ ipsq_enter(ill_t *ill, boolean_t force) ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); if (ipsq->ipsq_writer == NULL && - (ipsq->ipsq_current_ipif == NULL || waited_enough)) { + (type == CUR_OP || ipsq->ipsq_current_ipif == NULL || + waited_enough)) { break; } else if (ipsq->ipsq_writer != NULL) { mutex_exit(&ipsq->ipsq_lock); @@ -7661,6 +7450,18 @@ ipsq_enter(ill_t *ill, boolean_t force) return (B_TRUE); } +boolean_t +ill_perim_enter(ill_t *ill) +{ + return (ipsq_enter(ill, B_FALSE, CUR_OP)); +} + +void +ill_perim_exit(ill_t *ill) +{ + ipsq_exit(ill->ill_phyint->phyint_ipsq); +} + /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize * certain critical operations like plumbing (i.e. most set ioctls), @@ -9984,6 +9785,13 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill->ill_ip_muxid = islink ? li->l_index : 0; /* + * Mark the ipsq busy until the capability operations initiated below + * complete. The PLINK/UNLINK ioctl itself completes when our caller + * returns, but the capability operation may complete asynchronously + * much later. + */ + ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); + /* * If there's at least one up ipif on this ill, then we're bound to * the underlying driver via DLPI. In that case, renegotiate * capabilities to account for any possible change in modules @@ -9993,8 +9801,9 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (islink) ill_capability_probe(ill); else - ill_capability_reset(ill); + ill_capability_reset(ill, B_FALSE); } + ipsq_current_finish(ipsq); if (entered_ipsq) ipsq_exit(ipsq); @@ -18244,19 +18053,19 @@ ill_dl_down(ill_t *ill) ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Reset the capabilities if the negotiation is done or is - * still in progress. Note that ill_capability_reset() will - * set ill_dlpi_capab_state to IDS_UNKNOWN, so the subsequent - * DL_CAPABILITY_ACK and DL_NOTE_CAPAB_RENEG will be ignored. - * - * Further, reset ill_capab_reneg to be B_FALSE so that the - * subsequent DL_CAPABILITY_ACK can be ignored, to prevent - * the capabilities renegotiation from happening. + * ip_rput does not pass up normal (M_PROTO) DLPI messages + * after ILL_CONDEMNED is set. So in the unplumb case, we call + * ill_capability_dld_disable disable rightaway. If this is not + * an unplumb operation then the disable happens on receipt of + * the capab ack via ip_rput_dlpi_writer -> + * ill_capability_ack_thr. In both cases the order of + * the operations seen by DLD is capability disable followed + * by DL_UNBIND. Also the DLD capability disable needs a + * cv_wait'able context. 
*/ - if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) - ill_capability_reset(ill); - ill->ill_capab_reneg = B_FALSE; - + if (ill->ill_state_flags & ILL_CONDEMNED) + ill_capability_dld_disable(ill); + ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); } @@ -18314,7 +18123,6 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) ill->ill_dlpi_pending = prim; } mutex_exit(&ill->ill_lock); - putnext(ill->ill_wq, mp); } @@ -18372,6 +18180,26 @@ ill_dlpi_send(ill_t *ill, mblk_t *mp) ill_dlpi_dispatch(ill, mp); } +static void +ill_capability_send(ill_t *ill, mblk_t *mp) +{ + ill->ill_capab_pending_cnt++; + ill_dlpi_send(ill, mp); +} + +void +ill_capability_done(ill_t *ill) +{ + ASSERT(ill->ill_capab_pending_cnt != 0); + + ill_dlpi_done(ill, DL_CAPABILITY_REQ); + + ill->ill_capab_pending_cnt--; + if (ill->ill_capab_pending_cnt == 0 && + ill->ill_dlpi_capab_state == IDCS_OK) + ill_capability_reset_alloc(ill); +} + /* * Send all deferred DLPI messages without waiting for their ACKs. */ diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 2e940057f0..405cb653d5 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -4277,6 +4277,37 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, return (NULL); } +ire_t * +ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) +{ + irb_t *irb_ptr; + ire_t *ire; + + /* + * Lets look for an ire in the cachetable whose + * ire_addr matches the destination. + * Since we are being called by forwarding fastpath + * no need to check for Trusted Solaris label. + */ + irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( + dst, ipst->ips_ip_cache_table_size)]; + rw_enter(&irb_ptr->irb_lock, RW_READER); + for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { + if (ire->ire_marks & (IRE_MARK_CONDEMNED | + IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + continue; + } + if (ire->ire_addr == dst) { + IRE_REFHOLD(ire); + rw_exit(&irb_ptr->irb_lock); + return (ire); + } + } + rw_exit(&irb_ptr->irb_lock); + return (NULL); +} + + /* * Locate the interface ire that is tied to the cache ire 'cire' via * cire->ire_ihandle. diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index 34fd3cd765..ac14adf00d 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -28,8 +28,6 @@ */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Procedures for the kernel part of DVMRP, * a Distance-Vector Multicast Routing Protocol. @@ -683,7 +681,7 @@ ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) vifp->v_marks &= ~VIF_MARK_GOOD; vifp->v_marks |= VIF_MARK_CONDEMNED; mutex_exit(&(vifp)->v_lock); - suc = ipsq_enter(ill, B_FALSE); + suc = ipsq_enter(ill, B_FALSE, NEW_OP); ipsq = ill->ill_phyint->phyint_ipsq; } else { ipsq = ipsq_try_enter(ipif, NULL, diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index 7a036a34d9..f3c95ae362 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -1201,7 +1201,7 @@ ipsq_enter_byifindex(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) return (NULL); } ill_refrele(ill); - in_ipsq = ipsq_enter(ill, B_FALSE); + in_ipsq = ipsq_enter(ill, B_FALSE, NEW_OP); ill_waiter_dcr(ill); if (!in_ipsq) ill = NULL; @@ -3912,7 +3912,7 @@ retry: * be refheld for cleanup by those routines and it would be * a mutual deadlock. 
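 * (The third argument distinguishes NEW_OP, which starts a fresh
 * exclusive operation, from CUR_OP, which joins the operation already
 * current on the ipsq, as ill_capability_ack_thr() does.)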
*/ - success = ipsq_enter(ill, B_FALSE); + success = ipsq_enter(ill, B_FALSE, NEW_OP); ipsq = ill->ill_phyint->phyint_ipsq; ill_waiter_dcr(ill); mutex_enter(&connp->conn_lock); diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index a34b55693e..53665593be 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -1546,7 +1546,7 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out) if (inject->inj_isv6) { ip_rput_v6(ill->ill_rq, packet->ni_packet); } else { - ip_input(ill, NULL, packet->ni_packet, 0); + ip_input(ill, NULL, packet->ni_packet, NULL); } kmem_free(inject, sizeof (*inject)); ill_refrele(ill); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index c2b22ab956..9d677c3157 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -26,16 +26,36 @@ /* * IP interface to squeues. * - * IP creates an squeue instance for each CPU. The squeue pointer is saved in - * cpu_squeue field of the cpu structure. Each squeue is associated with a - * connection instance (conn_t). + * IP uses squeues to force serialization of packets, both incoming and + * outgoing. Each squeue is associated with a connection instance (conn_t) + * above, and a soft ring (if enabled) below. Each CPU will have a default + * squeue for outbound connections, and each soft ring of an interface will + * have an squeue to which it sends incoming packets. squeues are never + * destroyed, and if they become unused they are kept around against future + * needs. * - * For CPUs available at system startup time the squeue creation and association - * with CPU happens at MP initialization time. For CPUs added during dynamic - * reconfiguration, the initialization happens when the new CPU is configured in - * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either - * return per-CPU squeue or random squeue based on the ip_squeue_fanout - * variable. + * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU + * in the system there will be one squeue set, all of whose squeues will be + * bound to that CPU, plus one additional set known as the unbound set. Sets + * associated with CPUs will have one default squeue, for outbound + * connections, and a linked list of squeues used by various NICs for inbound + * packets. The unbound set also has a linked list of squeues, but no default + * squeue. + * + * When a CPU goes offline its squeue set is destroyed, and all its squeues + * are moved to the unbound set. When a CPU comes online, a new squeue set is + * created and the default set is searched for a default squeue formerly bound + * to this CPU. If no default squeue is found, a new one is created. + * + * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP + * and not the squeue code. squeue.c will not touch them, and we can modify + * them without holding the squeue lock because of the guarantee that squeues + * are never destroyed. ip_squeue locks must be held, however. + * + * All the squeue sets are protected by a single lock, the sqset_lock. This + * is also used to protect the sq_next and sq_set fields of an squeue_t. + * + * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock * * There are two modes of associating connection with squeues. 
The first mode * associates each connection with the CPU that creates the connection (either @@ -50,18 +70,13 @@ * may process the connection on whatever CPU it is scheduled. The squeue to CPU * binding is only relevant for the worker thread. * - * The list of all created squeues is kept in squeue_set structure. This list is - * used when ip_squeue_fanout is set and the load is distributed across all - * squeues. - * * INTERFACE: * - * squeue_t *ip_squeue_get(hint) + * squeue_t *ip_squeue_get(ill_rx_ring_t) * - * Find an squeue based on the 'hint' value. The hint is used as an index - * in the array of IP squeues available. The way hint is computed may - * affect the effectiveness of the squeue distribution. Currently squeues - * are assigned in round-robin fashion using lbolt as a hint. + * Returns the squeue associated with an ill receive ring. If the ring is + * not bound to a CPU, and we're currently servicing the interrupt which + * generated the packet, then bind the squeue to CPU. * * * DR Notes @@ -78,36 +93,31 @@ * o When the CPU is going online, it creates a new squeue for this CPU if * necessary and binds the squeue worker thread to this CPU. * - * TUNEBALES: - * - * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU - * associated with an squeue instance. + * TUNABLES: * - * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c - * should be compiled with SQUEUE_PROFILE enabled for this variable to have - * an impact. + * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then + * pick the default squeue from a random CPU, otherwise use our CPU's default + * squeue. * - * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue, - * otherwise get it from CPU->cpu_squeue. + * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or + * /dev/ip. * - * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and - * changed using ndd on /dev/tcp or /dev/ip. - * - * ip_squeue_worker_wait: global value for the sq_wait field for all squeues - * created. This is the time squeue code waits before waking up the worker - * thread after queuing a request. + * ip_squeue_worker_wait: global value for the sq_wait field for all squeues * + * created. This is the time squeue code waits before waking up the worker + * thread after queuing a request. */ #include <sys/types.h> #include <sys/debug.h> #include <sys/kmem.h> #include <sys/cpuvar.h> - #include <sys/cmn_err.h> #include <inet/common.h> #include <inet/ip.h> +#include <netinet/ip6.h> #include <inet/ip_if.h> +#include <inet/ip_ire.h> #include <inet/nd.h> #include <inet/ipclassifier.h> #include <sys/types.h> @@ -115,31 +125,21 @@ #include <sys/sunddi.h> #include <sys/dlpi.h> #include <sys/squeue_impl.h> +#include <sys/tihdr.h> +#include <inet/udp_impl.h> +#include <sys/strsubr.h> +#include <sys/zone.h> +#include <sys/dld.h> #include <sys/atomic.h> /* - * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1 - * mapping between squeue and NIC (or Rx ring) for performance reasons so - * each squeue can uniquely own a NIC or a Rx ring and do polling - * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU. - * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues - * can be created dynamically as needed. + * List of all created squeue sets. The list and its size are protected by + * sqset_lock. 
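+ * (Slot 0 is the unbound set, created once by ip_squeue_set_create(-1)
+ * from ip_squeue_init(); the per-CPU sets follow it.)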
*/ -#define MAX_SQUEUES_PER_CPU 32 -#define MIN_SQUEUES_PER_CPU 1 -uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU; +static squeue_set_t **sqset_global_list; /* list 0 is the unbound list */ +static uint_t sqset_global_size; +kmutex_t sqset_lock; -#define IP_NUM_SOFT_RINGS 2 -uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS; - -/* - * List of all created squeue sets. The size is protected by cpu_lock - */ -squeue_set_t **sqset_global_list; -uint_t sqset_global_size; - -int ip_squeue_bind = B_TRUE; -int ip_squeue_profile = B_TRUE; static void (*ip_squeue_create_callback)(squeue_t *) = NULL; /* @@ -149,82 +149,153 @@ static void (*ip_squeue_create_callback)(squeue_t *) = NULL; */ uint_t ip_squeue_worker_wait = 10; -static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t); +static squeue_t *ip_squeue_create(pri_t); +static squeue_set_t *ip_squeue_set_create(processorid_t); static int ip_squeue_cpu_setup(cpu_setup_t, int, void *); - -static void ip_squeue_set_bind(squeue_set_t *); -static void ip_squeue_set_unbind(squeue_set_t *); -static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t); +static void ip_squeue_set_move(squeue_t *, squeue_set_t *); +static void ip_squeue_set_destroy(cpu_t *); static void ip_squeue_clean(void *, mblk_t *, void *); -static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *); #define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS)) +static squeue_t * +ip_squeue_create(pri_t pri) +{ + squeue_t *sqp; + + sqp = squeue_create(ip_squeue_worker_wait, pri); + ASSERT(sqp != NULL); + if (ip_squeue_create_callback != NULL) + ip_squeue_create_callback(sqp); + return (sqp); +} + /* - * Create squeue set containing ip_squeues_per_cpu number of squeues - * for this CPU and bind them all to the CPU. + * Create a new squeue_set. If id == -1, then we're creating the unbound set, + * which should only happen once when we are first initialized. Otherwise id + * is the id of the CPU that needs a set, either because we are initializing + * or because the CPU has come online. + * + * If id != -1, then we need at a minimum to provide a default squeue for the + * new set. We search the unbound set for candidates, and if none are found we + * create a new one. */ static squeue_set_t * -ip_squeue_set_create(cpu_t *cp, boolean_t reuse) +ip_squeue_set_create(processorid_t id) { - int i; squeue_set_t *sqs; - squeue_t *sqp; - char sqname[64]; - processorid_t id = cp->cpu_id; + squeue_set_t *src = sqset_global_list[0]; + squeue_t **lastsqp, *sq; + squeue_t **defaultq_lastp = NULL; + + sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP); + sqs->sqs_cpuid = id; + + if (id == -1) { + ASSERT(sqset_global_size == 0); + sqset_global_list[0] = sqs; + sqset_global_size = 1; + return (sqs); + } - if (reuse) { - int i; + /* + * When we create an squeue set id != -1, we need to give it a + * default squeue, in order to support fanout of conns across + * CPUs. Try to find a former default squeue that matches this + * cpu id on the unbound squeue set. If no such squeue is found, + * find some non-default TCP squeue and steal it. If still no such + * candidate is found, create a new squeue. + */ - /* - * We may already have an squeue created for this CPU. Try to - * find one and reuse it if possible. 
- */ - for (i = 0; i < sqset_global_size; i++) { - sqs = sqset_global_list[i]; - if (id == sqs->sqs_bind) - return (sqs); + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&sqset_lock); + lastsqp = &src->sqs_head; + + while (*lastsqp) { + if ((*lastsqp)->sq_bind == id && + (*lastsqp)->sq_state & SQS_DEFAULT) { + defaultq_lastp = lastsqp; + break; + } + if (defaultq_lastp == NULL && + !((*lastsqp)->sq_state & SQS_DEFAULT)) { + defaultq_lastp = lastsqp; } + lastsqp = &(*lastsqp)->sq_next; + + } + if (defaultq_lastp) { + /* Remove from src set and set SQS_DEFAULT */ + sq = *defaultq_lastp; + *defaultq_lastp = sq->sq_next; + sq->sq_next = NULL; + if (!(sq->sq_state & SQS_DEFAULT)) { + mutex_enter(&sq->sq_lock); + sq->sq_state |= SQS_DEFAULT; + mutex_exit(&sq->sq_lock); + } + } else { + sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY); + sq->sq_state |= SQS_DEFAULT; } - sqs = kmem_zalloc(sizeof (squeue_set_t) + - (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP); - mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL); - sqs->sqs_list = (squeue_t **)&sqs[1]; - sqs->sqs_max_size = MAX_SQUEUES_PER_CPU; - sqs->sqs_bind = id; + sq->sq_set = sqs; + sqs->sqs_default = sq; + squeue_bind(sq, id); /* this locks squeue mutex */ - for (i = 0; i < ip_squeues_per_cpu; i++) { - bzero(sqname, sizeof (sqname)); + ASSERT(sqset_global_size <= NCPU); + sqset_global_list[sqset_global_size++] = sqs; + mutex_exit(&sqset_lock); + return (sqs); +} - (void) snprintf(sqname, sizeof (sqname), - "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid, - cp->cpu_id, i); +/* + * Called by ill_ring_add() to find an squeue to associate with a new ring. + */ - sqp = squeue_create(sqname, id, ip_squeue_worker_wait, - minclsyspri); +squeue_t * +ip_squeue_getfree(pri_t pri) +{ + squeue_set_t *sqs = sqset_global_list[0]; + squeue_t *sq; + mutex_enter(&sqset_lock); + for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) { /* - * The first squeue in each squeue_set is the DEFAULT - * squeue. 
+ * Select a non-default squeue */ - sqp->sq_state |= SQS_DEFAULT; + if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND))) + break; + } - ASSERT(sqp != NULL); + if (sq == NULL) { + sq = ip_squeue_create(pri); + sq->sq_set = sqs; + sq->sq_next = sqs->sqs_head; + sqs->sqs_head = sq; + } - squeue_profile_enable(sqp); - sqs->sqs_list[sqs->sqs_size++] = sqp; + ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL | + SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE | + SQS_POLL_THR_QUIESCED))); - if (ip_squeue_create_callback != NULL) - ip_squeue_create_callback(sqp); - } + mutex_enter(&sq->sq_lock); + sq->sq_state |= SQS_ILL_BOUND; + mutex_exit(&sq->sq_lock); + mutex_exit(&sqset_lock); - if (ip_squeue_bind && cpu_is_online(cp)) - ip_squeue_set_bind(sqs); + if (sq->sq_priority != pri) { + thread_lock(sq->sq_worker); + (void) thread_change_pri(sq->sq_worker, pri, 0); + thread_unlock(sq->sq_worker); - sqset_global_list[sqset_global_size++] = sqs; - ASSERT(sqset_global_size <= NCPU); - return (sqs); + thread_lock(sq->sq_poll_thr); + (void) thread_change_pri(sq->sq_poll_thr, pri, 0); + thread_unlock(sq->sq_poll_thr); + + sq->sq_priority = pri; + } + return (sq); } /* @@ -234,876 +305,450 @@ void ip_squeue_init(void (*callback)(squeue_t *)) { int i; + squeue_set_t *sqs; ASSERT(sqset_global_list == NULL); - if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU) - ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU; - else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU) - ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU; - ip_squeue_create_callback = callback; squeue_init(); + mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL); sqset_global_list = - kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP); + kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP); sqset_global_size = 0; - mutex_enter(&cpu_lock); + /* + * We are called at system boot time and we don't + * expect memory allocation failure. + */ + sqs = ip_squeue_set_create(-1); + ASSERT(sqs != NULL); + mutex_enter(&cpu_lock); /* Create squeue for each active CPU available */ for (i = 0; i < NCPU; i++) { - cpu_t *cp = cpu[i]; + cpu_t *cp = cpu_get(i); if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) { - cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE); + /* + * We are called at system boot time and we don't + * expect memory allocation failure then + */ + cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id); + ASSERT(cp->cpu_squeue_set != NULL); } } register_cpu_setup_func(ip_squeue_cpu_setup, NULL); - mutex_exit(&cpu_lock); - - if (ip_squeue_profile) - squeue_profile_start(); } /* - * Get squeue_t structure based on index. - * Since the squeue list can only grow, no need to grab any lock. + * Get a default squeue, either from the current CPU or a CPU derived by hash + * from the index argument, depending upon the setting of ip_squeue_fanout. */ squeue_t * ip_squeue_random(uint_t index) { - squeue_set_t *sqs; - - sqs = sqset_global_list[index % sqset_global_size]; - return (sqs->sqs_list[index % sqs->sqs_size]); -} - -/* ARGSUSED */ -static void -ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2) -{ - squeue_t *sqp = arg2; - ill_rx_ring_t *ring = (ill_rx_ring_t *)mp->b_wptr; - ill_t *ill; - - ASSERT(sqp != NULL); - mp->b_wptr = NULL; - - if (ring == NULL) { - return; - } + squeue_set_t *sqs = NULL; + squeue_t *sq; /* - * Clean up squeue + * The minimum value of sqset_global_size is 2, one for the unbound + * squeue set and another for the squeue set of the zeroth CPU. 
+ * Even though the value could be changing, it can never go below 2, + * so the assert does not need the lock protection. */ - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB); - sqp->sq_rx_ring = NULL; - mutex_exit(&sqp->sq_lock); + ASSERT(sqset_global_size > 1); - ill = ring->rr_ill; - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { - ASSERT(ring->rr_handle != NULL); - ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle); - } + /* Protect against changes to sqset_global_list */ + mutex_enter(&sqset_lock); - /* - * Cleanup the ring - */ - - ring->rr_blank = NULL; - ring->rr_handle = NULL; - ring->rr_sqp = NULL; + if (!ip_squeue_fanout) + sqs = CPU->cpu_squeue_set; /* - * Signal ill that cleanup is done + * sqset_global_list[0] corresponds to the unbound squeue set. + * The computation below picks a set other than the unbound set. */ - mutex_enter(&ill->ill_lock); - ring->rr_ring_state = ILL_RING_FREE; - cv_signal(&ill->ill_cv); - mutex_exit(&ill->ill_lock); + if (sqs == NULL) + sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1]; + sq = sqs->sqs_default; + + mutex_exit(&sqset_lock); + ASSERT(sq); + return (sq); } /* - * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. - * The real cleanup happens behind the squeue via ip_squeue_clean function but - * we need to protect ourselves from 2 threads trying to cleanup at the same - * time (possible with one port going down for aggr and someone tearing down the - * entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock - * to indicate when the cleanup has started (1 ref) and when the cleanup - * is done (0 ref). When a new ring gets assigned to squeue, we start by - * putting 2 ref on ill_inuse_ref. + * Move squeue from its current set to newset. Not used for default squeues. + * Bind or unbind the worker thread as appropriate. */ + static void -ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) +ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset) { - conn_t *connp; - squeue_t *sqp; - mblk_t *mp; - - ASSERT(rx_ring != NULL); + squeue_set_t *set; + squeue_t **lastsqp; + processorid_t cpuid = newset->sqs_cpuid; - /* Just clean one squeue */ - mutex_enter(&ill->ill_lock); - /* - * Reset the ILL_SOFT_RING_ASSIGN bit so that - * ip_squeue_soft_ring_affinty() will not go - * ahead with assigning rings. - */ - ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; - while (rx_ring->rr_ring_state == ILL_RING_INPROC) - /* Some operations pending on the ring. Wait */ - cv_wait(&ill->ill_cv, &ill->ill_lock); - - if (rx_ring->rr_ring_state != ILL_RING_INUSE) { - /* - * Someone already trying to clean - * this squeue or it's already been cleaned. - */ - mutex_exit(&ill->ill_lock); - return; - } - sqp = rx_ring->rr_sqp; + ASSERT(!(sq->sq_state & SQS_DEFAULT)); + ASSERT(!MUTEX_HELD(&sq->sq_lock)); + ASSERT(MUTEX_HELD(&sqset_lock)); - if (sqp == NULL) { - /* - * The rx_ring never had a squeue assigned to it. - * We are under ill_lock so we can clean it up - * here itself since no one can get to it. 
- */ - rx_ring->rr_blank = NULL; - rx_ring->rr_handle = NULL; - rx_ring->rr_sqp = NULL; - rx_ring->rr_ring_state = ILL_RING_FREE; - mutex_exit(&ill->ill_lock); + set = sq->sq_set; + if (set == newset) return; - } - - /* Indicate that it's being cleaned */ - rx_ring->rr_ring_state = ILL_RING_BEING_FREED; - ASSERT(sqp != NULL); - mutex_exit(&ill->ill_lock); - /* - * Use the preallocated ill_unbind_conn for this purpose - */ - connp = ill->ill_dls_capab->ill_unbind_conn; - - if (connp->conn_tcp->tcp_closemp.b_prev == NULL) { - connp->conn_tcp->tcp_closemp_used = B_TRUE; - } else { - cmn_err(CE_PANIC, "ip_squeue_clean_ring: " - "concurrent use of tcp_closemp_used: connp %p tcp %p\n", - (void *)connp, (void *)connp->conn_tcp); - } - - TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15); - mp = &connp->conn_tcp->tcp_closemp; - CONN_INC_REF(connp); - - /* - * Since the field sq_rx_ring for default squeue is NULL, - * ip_squeue_clean() will have no way to get the ring if we - * don't pass the pointer to it. We use b_wptr to do so - * as use of b_wptr for any other purpose is not expected. - */ - - ASSERT(mp->b_wptr == NULL); - mp->b_wptr = (unsigned char *)rx_ring; - squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); - - mutex_enter(&ill->ill_lock); - while (rx_ring->rr_ring_state != ILL_RING_FREE) - cv_wait(&ill->ill_cv, &ill->ill_lock); - mutex_exit(&ill->ill_lock); + lastsqp = &set->sqs_head; + while (*lastsqp != sq) + lastsqp = &(*lastsqp)->sq_next; + + *lastsqp = sq->sq_next; + sq->sq_next = newset->sqs_head; + newset->sqs_head = sq; + sq->sq_set = newset; + if (cpuid == -1) + squeue_unbind(sq); + else + squeue_bind(sq, cpuid); } -void -ip_squeue_clean_all(ill_t *ill) +/* + * Move squeue from its current set to cpuid's set and bind to cpuid. + */ + +int +ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid) { - int idx; + cpu_t *cpu; + squeue_set_t *set; - /* - * No need to clean if poll_capab isn't set for this ill - */ - if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) - return; + if (sq->sq_state & SQS_DEFAULT) + return (-1); - for (idx = 0; idx < ILL_MAX_RINGS; idx++) { - ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; + ASSERT(MUTEX_HELD(&cpu_lock)); - ip_squeue_clean_ring(ill, ipr); - } + cpu = cpu_get(cpuid); + if (!CPU_ISON(cpu)) + return (-1); - ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); + mutex_enter(&sqset_lock); + set = cpu->cpu_squeue_set; + if (set != NULL) + ip_squeue_set_move(sq, set); + mutex_exit(&sqset_lock); + return ((set == NULL) ? -1 : 0); } -typedef struct ip_taskq_arg { - ill_t *ip_taskq_ill; - ill_rx_ring_t *ip_taskq_ill_rx_ring; - cpu_t *ip_taskq_cpu; -} ip_taskq_arg_t; - /* - * Do a Rx ring to squeue binding. Find a unique squeue that is not - * managing a receive ring. If no such squeue exists, dynamically - * create a new one in the squeue set. - * - * The function runs via the system taskq. The ill passed as an - * argument can't go away since we hold a ref. The lock order is - * ill_lock -> sqs_lock -> sq_lock. - * - * If we are binding a Rx ring to a squeue attached to the offline CPU, - * no need to check that because squeues are never destroyed once - * created. + * The mac layer is calling, asking us to move an squeue to a + * new CPU. This routine is called with cpu_lock held. 
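+ *
+ * A minimal sketch of the expected calling pattern (illustrative only;
+ * it mirrors the call made from ip_squeue_add_ring() below, where the
+ * cpuid comes from the mac layer):
+ *
+ *	mutex_enter(&cpu_lock);
+ *	(void) ip_squeue_bind_ring(ill, rx_ring, cpuid);
+ *	mutex_exit(&cpu_lock);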
*/ -/* ARGSUSED */ -static void -ip_squeue_extend(void *arg) +void +ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid) { - ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg; - ill_t *ill = sq_arg->ip_taskq_ill; - ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring; - cpu_t *intr_cpu = sq_arg->ip_taskq_cpu; - squeue_set_t *sqs; - squeue_t *sqp = NULL; - - ASSERT(ill != NULL); - ASSERT(ill_rx_ring != NULL); - kmem_free(arg, sizeof (ip_taskq_arg_t)); + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring->rr_ill == ill); - /* - * Make sure the CPU that originally took the interrupt still - * exists. - */ - if (!CPU_ISON(intr_cpu)) - intr_cpu = CPU; - - sqs = intr_cpu->cpu_squeue_set; - - /* - * If this ill represents link aggregation, then there might be - * multiple NICs trying to register them selves at the same time - * and in order to ensure that test and assignment of free rings - * is sequential, we need to hold the ill_lock. - */ mutex_enter(&ill->ill_lock); - sqp = ip_find_unused_squeue(sqs, B_FALSE); - if (sqp == NULL) { - /* - * We hit the max limit of squeues allowed per CPU. - * Assign this rx_ring to DEFAULT squeue of the - * interrupted CPU but the squeue will not manage - * the ring. Also print a warning. - */ - cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already " - "has max number of squeues. System performance might " - "become suboptimal\n", sqs->sqs_bind, (void *)sqs); - - /* the first squeue in the list is the default squeue */ - sqp = sqs->sqs_list[0]; - ASSERT(sqp != NULL); - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - + if (rx_ring->rr_ring_state == RR_FREE || + rx_ring->rr_ring_state == RR_FREE_INPROG) { mutex_exit(&ill->ill_lock); - ill_waiter_dcr(ill); return; } - ASSERT(MUTEX_HELD(&sqp->sq_lock)); - sqp->sq_rx_ring = ill_rx_ring; - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - - sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB); - mutex_exit(&sqp->sq_lock); + if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1) + rx_ring->rr_ring_state = RR_SQUEUE_BOUND; mutex_exit(&ill->ill_lock); - - /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ - ill_waiter_dcr(ill); } -/* - * Do a Rx ring to squeue binding. Find a unique squeue that is not - * managing a receive ring. If no such squeue exists, dynamically - * create a new one in the squeue set. - * - * The function runs via the system taskq. The ill passed as an - * argument can't go away since we hold a ref. The lock order is - * ill_lock -> sqs_lock -> sq_lock. - * - * If we are binding a Rx ring to a squeue attached to the offline CPU, - * no need to check that because squeues are never destroyed once - * created. 
- */ -/* ARGSUSED */ -static void -ip_squeue_soft_ring_affinity(void *arg) +void * +ip_squeue_add_ring(ill_t *ill, void *mrp) { - ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg; - ill_t *ill = sq_arg->ip_taskq_ill; - ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab; - ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring; - cpu_t *intr_cpu = sq_arg->ip_taskq_cpu; - cpu_t *bind_cpu; - int cpu_id = intr_cpu->cpu_id; - int min_cpu_id, max_cpu_id; - boolean_t enough_uniq_cpus = B_FALSE; - boolean_t enough_cpus = B_FALSE; - squeue_set_t *sqs, *last_sqs; - squeue_t *sqp = NULL; - int i, j; - - ASSERT(ill != NULL); - kmem_free(arg, sizeof (ip_taskq_arg_t)); + mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; + ill_rx_ring_t *rx_ring, *ring_tbl; + int ip_rx_index; + squeue_t *sq = NULL; + pri_t pri; - /* - * Make sure the CPU that originally took the interrupt still - * exists. - */ - if (!CPU_ISON(intr_cpu)) { - intr_cpu = CPU; - cpu_id = intr_cpu->cpu_id; - } + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(mrfp->mrf_type == MAC_RX_FIFO); + ASSERT(ill->ill_dld_capab != NULL); - /* - * If this ill represents link aggregation, then there might be - * multiple NICs trying to register them selves at the same time - * and in order to ensure that test and assignment of free rings - * is sequential, we need to hold the ill_lock. - */ - mutex_enter(&ill->ill_lock); + ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl; - if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) { - mutex_exit(&ill->ill_lock); - return; + mutex_enter(&ill->ill_lock); + for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { + rx_ring = &ring_tbl[ip_rx_index]; + if (rx_ring->rr_ring_state == RR_FREE) + break; } - /* - * We need to fanout the interrupts from the NIC. We do that by - * telling the driver underneath to create soft rings and use - * worker threads (if the driver advertized SOFT_RING capability) - * Its still a big performance win to if we can fanout to the - * threads on the same core that is taking interrupts. - * - * Since we don't know the interrupt to CPU binding, we don't - * assign any squeues or affinity to worker threads in the NIC. - * At the time of the first interrupt, we know which CPU is - * taking interrupts and try to find other threads on the same - * core. Assuming, ip_threads_per_cpu is correct and cpus are - * numbered sequentially for each core (XXX need something better - * than this in future), find the lowest number and highest - * number thread for that core. - * - * If we have one more thread per core than number of soft rings, - * then don't assign any worker threads to the H/W thread (cpu) - * taking interrupts (capability negotiation tries to ensure this) - * - * If the number of threads per core are same as the number of - * soft rings, then assign the worker affinity and squeue to - * the same cpu. - * - * Otherwise, just fanout to higher number CPUs starting from - * the interrupted CPU. - */ - min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu; - max_cpu_id = min_cpu_id + ip_threads_per_cpu; - - /* - * Quickly check if there are enough CPUs present for fanout - * and also max_cpu_id is less than the id of the active CPU. - * We use the cpu_id stored in the last squeue_set to get - * an idea. The scheme is by no means perfect since it doesn't - * take into account CPU DR operations and the fact that - * interrupts themselves might change. 
An ideal scenario
- * would be to ensure that interrupts run cpus by themselves
- * and worker threads never have affinity to those CPUs. If
- * the interrupts move to CPU which had a worker thread, it
- * should be changed. Probably callbacks similar to CPU offline
- * are needed to make it work perfectly.
- */
- last_sqs = sqset_global_list[sqset_global_size - 1];
- if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
- if ((max_cpu_id - min_cpu_id) >
- ill_soft_ring->ill_dls_soft_ring_cnt)
- enough_uniq_cpus = B_TRUE;
- else if ((max_cpu_id - min_cpu_id) >=
- ill_soft_ring->ill_dls_soft_ring_cnt)
- enough_cpus = B_TRUE;
+ if (ip_rx_index == ILL_MAX_RINGS) {
+ /*
+ * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
+ * we have devices which can overwhelm this limit,
+ * ILL_MAX_RINGS should be made configurable. Meanwhile it
+ * causes no panic, because the driver will pass ip_input a
+ * NULL handle, which will make IP allocate the default squeue,
+ * and polling mode will not be used for this ring.
+ */
+ cmn_err(CE_NOTE,
+ "Reached maximum number of receiving rings (%d) for %s\n",
+ ILL_MAX_RINGS, ill->ill_name);
+ mutex_exit(&ill->ill_lock);
+ return (NULL);
 }
- j = 0;
- for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
- if (enough_uniq_cpus) {
- if ((min_cpu_id + i) == cpu_id) {
- j++;
- continue;
- }
- bind_cpu = cpu[min_cpu_id + i];
- } else if (enough_cpus) {
- bind_cpu = cpu[min_cpu_id + i];
- } else {
- /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
- bind_cpu = cpu[(cpu_id + i) % ncpus];
- }
+ bzero(rx_ring, sizeof (ill_rx_ring_t));
+ rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
+ /* XXX: Hard code it to tcp accept for now */
+ rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
- /*
- * Check if the CPU actually exist and active. If not,
- * use the interrupted CPU. ip_find_unused_squeue() will
- * find the right CPU to fanout anyway.
- */
- if (!CPU_ISON(bind_cpu))
- bind_cpu = intr_cpu;
+ rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
+ rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
+ rx_ring->rr_intr_disable =
+ (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
+ rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
+ rx_ring->rr_ill = ill;
- sqs = bind_cpu->cpu_squeue_set;
- ASSERT(sqs != NULL);
- ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
+ pri = mrfp->mrf_flow_priority;
- sqp = ip_find_unused_squeue(sqs, B_TRUE);
- if (sqp == NULL) {
- /*
- * We hit the max limit of squeues allowed per CPU.
- * Assign this rx_ring to DEFAULT squeue of the
- * interrupted CPU but thesqueue will not manage
- * the ring. Also print a warning.
- */
- cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
- "%d/%p already has max number of squeues.
System " - "performance might become suboptimal\n", - sqs->sqs_bind, (void *)sqs); + sq = ip_squeue_getfree(pri); - /* the first squeue in the list is the default squeue */ - sqp = intr_cpu->cpu_squeue_set->sqs_list[0]; - ASSERT(sqp != NULL); + mutex_enter(&sq->sq_lock); + sq->sq_rx_ring = rx_ring; + rx_ring->rr_sqp = sq; - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - continue; + sq->sq_state |= SQS_POLL_CAPAB; - } - ASSERT(MUTEX_HELD(&sqp->sq_lock)); - ill_rx_ring->rr_sqp = sqp; - sqp->sq_rx_ring = ill_rx_ring; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - sqp->sq_state |= SQS_ILL_BOUND; - - /* assign affinity to soft ring */ - if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) { - ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle, - sqp->sq_bind); - } - mutex_exit(&sqp->sq_lock); - } + rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND; + sq->sq_ill = ill; + mutex_exit(&sq->sq_lock); mutex_exit(&ill->ill_lock); - ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle, - SOFT_RING_FANOUT); + DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int, + ip_rx_index, void *, mrfp->mrf_rx_arg); - mutex_enter(&ill->ill_lock); - ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; - mutex_exit(&ill->ill_lock); + /* Assign the squeue to the specified CPU as well */ + mutex_enter(&cpu_lock); + (void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id); + mutex_exit(&cpu_lock); - /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ - ill_waiter_dcr(ill); + return (rx_ring); } -/* ARGSUSED */ +/* + * sanitize the squeue etc. Some of the processing + * needs to be done from inside the perimeter. + */ void -ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring, - mblk_t *mp_chain, struct mac_header_info_s *mhip) +ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) { - ip_taskq_arg_t *taskq_arg; - boolean_t refheld; - - mutex_enter(&ill->ill_lock); - if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) { - taskq_arg = (ip_taskq_arg_t *) - kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP); - - if (taskq_arg == NULL) - goto out; + squeue_t *sqp; - taskq_arg->ip_taskq_ill = ill; - taskq_arg->ip_taskq_ill_rx_ring = NULL; - taskq_arg->ip_taskq_cpu = CPU; + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring != NULL); - /* - * Set ILL_SOFT_RING_ASSIGN flag. We don't want - * the next interrupt to schedule a task for calling - * ip_squeue_soft_ring_affinity(); - */ - ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN; - } else { + /* Just clean one squeue */ + mutex_enter(&ill->ill_lock); + if (rx_ring->rr_ring_state == RR_FREE) { mutex_exit(&ill->ill_lock); - goto out; + return; } + rx_ring->rr_ring_state = RR_FREE_INPROG; + sqp = rx_ring->rr_sqp; + + mutex_enter(&sqp->sq_lock); + sqp->sq_state |= SQS_POLL_CLEANUP; + cv_signal(&sqp->sq_worker_cv); mutex_exit(&ill->ill_lock); - refheld = ill_waiter_inc(ill); - if (refheld) { - if (taskq_dispatch(system_taskq, - ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP)) - goto out; - - /* release ref on ill if taskq dispatch fails */ - ill_waiter_dcr(ill); - } + while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE)) + cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock); + sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND); + + ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL | + SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE | + SQS_POLL_THR_QUIESCED))); + + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + /* - * Turn on CAPAB_SOFT_RING so that affinity assignment - * can be tried again later. 
+ * Logically free the squeue. It goes back to the set of unused + * squeues */ + mutex_enter(&sqset_lock); + ip_squeue_set_move(sqp, sqset_global_list[0]); + mutex_exit(&sqset_lock); + mutex_enter(&ill->ill_lock); - ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; + rx_ring->rr_ring_state = RR_FREE; mutex_exit(&ill->ill_lock); - kmem_free(taskq_arg, sizeof (ip_taskq_arg_t)); - -out: - ip_input(ill, NULL, mp_chain, mhip); } -static squeue_t * -ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout) +/* + * Stop the squeue from polling. This needs to be done + * from inside the perimeter. + */ +void +ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring) { - int i; - squeue_set_t *best_sqs = NULL; - squeue_set_t *curr_sqs = NULL; - int min_sq = 0; - squeue_t *sqp = NULL; - char sqname[64]; - cpu_t *bind_cpu; - - /* - * If fanout is set and the passed squeue_set already has some - * squeues which are managing the NICs, try to find squeues on - * unused CPU. - */ - if (sqs->sqs_size > 1 && fanout) { - /* - * First check to see if any squeue on the CPU passed - * is managing a NIC. - */ - mutex_enter(&sqs->sqs_lock); - for (i = 0; i < sqs->sqs_size; i++) { - mutex_enter(&sqs->sqs_list[i]->sq_lock); - if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) && - !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) { - mutex_exit(&sqs->sqs_list[i]->sq_lock); - break; - } - mutex_exit(&sqs->sqs_list[i]->sq_lock); - } - mutex_exit(&sqs->sqs_lock); - if (i != sqs->sqs_size) { - best_sqs = NULL; - - for (i = sqset_global_size - 1; i >= 0; i--) { - curr_sqs = sqset_global_list[i]; - /* - * Check and make sure the CPU that sqs - * is bound to is valid. There could be - * sqs's around whose CPUs could have - * been DR'd out. - */ - mutex_enter(&cpu_lock); - if (cpu_get(curr_sqs->sqs_bind) != NULL) { - if (best_sqs == NULL) { - best_sqs = curr_sqs; - min_sq = curr_sqs->sqs_size; - } else if (curr_sqs->sqs_size < - min_sq) { - best_sqs = curr_sqs; - min_sq = curr_sqs->sqs_size; - } - } - mutex_exit(&cpu_lock); - } - - ASSERT(best_sqs != NULL); - sqs = best_sqs; - } - } + squeue_t *sqp; - mutex_enter(&sqs->sqs_lock); + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring != NULL); - for (i = 0; i < sqs->sqs_size; i++) { - mutex_enter(&sqs->sqs_list[i]->sq_lock); - if ((sqs->sqs_list[i]->sq_state & - (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) { - sqp = sqs->sqs_list[i]; - break; - } - mutex_exit(&sqs->sqs_list[i]->sq_lock); - } + sqp = rx_ring->rr_sqp; + mutex_enter(&sqp->sq_lock); + sqp->sq_state |= SQS_POLL_QUIESCE; + cv_signal(&sqp->sq_worker_cv); + while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) + cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock); - if (sqp == NULL) { - /* Need to create a new squeue */ - if (sqs->sqs_size == sqs->sqs_max_size) { - /* - * Reached the max limit for squeue - * we can allocate on this CPU. - */ - mutex_exit(&sqs->sqs_lock); - return (NULL); - } + mutex_exit(&sqp->sq_lock); +} - mutex_enter(&cpu_lock); - if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) { - /* Too bad, CPU got DR'd out, return NULL */ - mutex_exit(&cpu_lock); - mutex_exit(&sqs->sqs_lock); - return (NULL); - } +/* + * Restart polling etc. Needs to be inside the perimeter to + * prevent races. 
+ */ +void +ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring) +{ + squeue_t *sqp; - bzero(sqname, sizeof (sqname)); - (void) snprintf(sqname, sizeof (sqname), - "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid, - bind_cpu->cpu_id, sqs->sqs_size); - mutex_exit(&cpu_lock); + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring != NULL); - sqp = squeue_create(sqname, sqs->sqs_bind, - ip_squeue_worker_wait, minclsyspri); + sqp = rx_ring->rr_sqp; + mutex_enter(&sqp->sq_lock); + /* + * Handle change in number of rings between the quiesce and + * restart operations by checking for a previous quiesce before + * attempting a restart. + */ + if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) { + mutex_exit(&sqp->sq_lock); + return; + } + sqp->sq_state |= SQS_POLL_RESTART; + cv_signal(&sqp->sq_worker_cv); + while (!(sqp->sq_state & SQS_POLL_RESTART_DONE)) + cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock); + sqp->sq_state &= ~SQS_POLL_RESTART_DONE; + mutex_exit(&sqp->sq_lock); +} - ASSERT(sqp != NULL); +/* + * sanitize all squeues associated with the ill. + */ +void +ip_squeue_clean_all(ill_t *ill) +{ + int idx; + ill_rx_ring_t *rx_ring; - squeue_profile_enable(sqp); - /* - * Other functions scanning sqs_list don't take sqs_lock. - * Once sqp is stored in sqs_list[] global visibility is - * ensured before incrementing the sqs_size counter. - */ - sqs->sqs_list[sqs->sqs_size] = sqp; - membar_producer(); - sqs->sqs_size++; - - if (ip_squeue_create_callback != NULL) - ip_squeue_create_callback(sqp); - - if (ip_squeue_bind) { - mutex_enter(&cpu_lock); - bind_cpu = cpu_get(sqs->sqs_bind); - if (bind_cpu != NULL && cpu_is_online(bind_cpu)) { - squeue_bind(sqp, -1); - } - mutex_exit(&cpu_lock); - } - mutex_enter(&sqp->sq_lock); + for (idx = 0; idx < ILL_MAX_RINGS; idx++) { + rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx]; + ip_squeue_clean_ring(ill, rx_ring); } - - mutex_exit(&sqs->sqs_lock); - ASSERT(sqp != NULL); - return (sqp); } /* - * Find the squeue assigned to manage this Rx ring. If the Rx ring is not - * owned by a squeue yet, do the assignment. When the NIC registers it - * Rx rings with IP, we don't know where the interrupts will land and - * hence we need to wait till this point to do the assignment. + * Used by IP to get the squeue associated with a ring. If the squeue isn't + * yet bound to a CPU, and we're being called directly from the NIC's + * interrupt, then we know what CPU we want to assign the squeue to, so + * dispatch that task to a taskq. */ squeue_t * ip_squeue_get(ill_rx_ring_t *ill_rx_ring) { squeue_t *sqp; - ill_t *ill; - int interrupt; - ip_taskq_arg_t *taskq_arg; - boolean_t refheld; - - if (ill_rx_ring == NULL) - return (IP_SQUEUE_GET(lbolt)); - - sqp = ill_rx_ring->rr_sqp; - /* - * Do a quick check. If it's not NULL, we are done. - * Squeues are never destroyed so worse we will bind - * this connection to a suboptimal squeue. - * - * This is the fast path case. - */ - if (sqp != NULL) - return (sqp); - - ill = ill_rx_ring->rr_ill; - ASSERT(ill != NULL); - - interrupt = servicing_interrupt(); - taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t), - KM_NOSLEEP); - mutex_enter(&ill->ill_lock); - /* - * Check sqp under the lock again for atomicity. Possible race with - * a previously scheduled ip_squeue_get -> ip_squeue_extend. - * Do the ring to squeue binding only if we are in interrupt context - * AND the ring is not already bound AND there is no one else trying - * the bind already. 
- */
- sqp = ill_rx_ring->rr_sqp;
- if (sqp != NULL || !interrupt ||
- ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
- /*
- * Note that the ring might get bound once we drop the lock
- * below, if a previous request is in progress i.e. if the ring
- * state is ILL_RING_INPROC. The incoming connection on whose
- * behalf we are currently here might get a suboptimal squeue
- * via the call to IP_SQUEUE_GET below, but there is no
- * correctness issue.
- */
- mutex_exit(&ill->ill_lock);
- if (taskq_arg != NULL)
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
- if (sqp != NULL)
- return (sqp);
+ if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
 return (IP_SQUEUE_GET(lbolt));
- }
-
- /*
- * No sqp assigned yet. Can't really do that in interrupt
- * context. Assign the default sqp to this connection and
- * trigger creation of new sqp and binding it to this ring
- * via taskq. Need to make sure ill stays around.
- */
- taskq_arg->ip_taskq_ill = ill;
- taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
- taskq_arg->ip_taskq_cpu = CPU;
- ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
- mutex_exit(&ill->ill_lock);
- refheld = ill_waiter_inc(ill);
- if (refheld) {
- if (taskq_dispatch(system_taskq, ip_squeue_extend,
- taskq_arg, TQ_NOSLEEP) != NULL) {
- return (IP_SQUEUE_GET(lbolt));
- }
- }
- /*
- * The ill is closing and we could not get a reference on the ill OR
- * taskq_dispatch failed probably due to memory allocation failure.
- * We will try again next time.
- */
- mutex_enter(&ill->ill_lock);
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
- mutex_exit(&ill->ill_lock);
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
- if (refheld)
- ill_waiter_dcr(ill);
- return (IP_SQUEUE_GET(lbolt));
+ return (sqp);
 }

 /*
- * NDD hooks for setting ip_squeue_xxx tuneables.
+ * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
+ * squeues are unbound and moved to the unbound set.
 */
-
-/* ARGSUSED */
-int
-ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t addr, cred_t *cr)
+static void
+ip_squeue_set_destroy(cpu_t *cpu)
 {
- int *bind_enabled = (int *)addr;
- long new_value;
 int i;
+ squeue_t *sqp, *lastsqp = NULL;
+ squeue_set_t *sqs, *unbound = sqset_global_list[0];

- if (ddi_strtol(value, NULL, 10, &new_value) != 0)
- return (EINVAL);
+ mutex_enter(&sqset_lock);
+ if ((sqs = cpu->cpu_squeue_set) == NULL) {
+ mutex_exit(&sqset_lock);
+ return;
+ }

- if (ip_squeue_bind == new_value)
- return (0);
+ /* Move all squeues to unbound set */

- *bind_enabled = new_value;
- mutex_enter(&cpu_lock);
- if (new_value == 0) {
- for (i = 0; i < sqset_global_size; i++)
- ip_squeue_set_unbind(sqset_global_list[i]);
- } else {
- for (i = 0; i < sqset_global_size; i++)
- ip_squeue_set_bind(sqset_global_list[i]);
+ for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
+ squeue_unbind(sqp);
+ sqp->sq_set = unbound;
+ }
+ if (sqs->sqs_head) {
+ lastsqp->sq_next = unbound->sqs_head;
+ unbound->sqs_head = sqs->sqs_head;
 }
- mutex_exit(&cpu_lock);
- return (0);
-}
+ /* Also move default squeue to unbound set */

-/*
- * Set squeue profiling.
- * 0 means "disable" - * 1 means "enable" - * 2 means "enable and reset" - */ -/* ARGSUSED */ -int -ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, - cred_t *cr) -{ - int *profile_enabled = (int *)cp; - long new_value; - squeue_set_t *sqs; - - if (ddi_strtol(value, NULL, 10, &new_value) != 0) - return (EINVAL); - - if (new_value == 0) - squeue_profile_stop(); - else if (new_value == 1) - squeue_profile_start(); - else if (new_value == 2) { - int i, j; - - squeue_profile_stop(); - mutex_enter(&cpu_lock); - for (i = 0; i < sqset_global_size; i++) { - sqs = sqset_global_list[i]; - for (j = 0; j < sqs->sqs_size; j++) { - squeue_profile_reset(sqs->sqs_list[j]); - } - } - mutex_exit(&cpu_lock); + sqp = sqs->sqs_default; + ASSERT(sqp); + ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT); - new_value = 1; - squeue_profile_start(); - } - *profile_enabled = new_value; + sqp->sq_next = unbound->sqs_head; + unbound->sqs_head = sqp; + squeue_unbind(sqp); + sqp->sq_set = unbound; - return (0); + for (i = 1; i < sqset_global_size; i++) + if (sqset_global_list[i] == sqs) + break; + + ASSERT(i < sqset_global_size); + sqset_global_list[i] = sqset_global_list[sqset_global_size - 1]; + sqset_global_list[sqset_global_size - 1] = NULL; + sqset_global_size--; + + mutex_exit(&sqset_lock); + kmem_free(sqs, sizeof (*sqs)); } /* * Reconfiguration callback */ - /* ARGSUSED */ static int ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg) { - cpu_t *cp = cpu[id]; + cpu_t *cp = cpu_get(id); ASSERT(MUTEX_HELD(&cpu_lock)); switch (what) { case CPU_CONFIG: - /* - * A new CPU is added. Create an squeue for it but do not bind - * it yet. - */ - if (cp->cpu_squeue_set == NULL) - cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE); - break; case CPU_ON: case CPU_INIT: case CPU_CPUPART_IN: - if (cp->cpu_squeue_set == NULL) { - cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE); - } - if (ip_squeue_bind) - ip_squeue_set_bind(cp->cpu_squeue_set); + if (cp->cpu_squeue_set == NULL) + cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id); break; case CPU_UNCONFIG: case CPU_OFF: case CPU_CPUPART_OUT: ASSERT((cp->cpu_squeue_set != NULL) || (cp->cpu_flags & CPU_OFFLINE)); - if (cp->cpu_squeue_set != NULL) { - ip_squeue_set_unbind(cp->cpu_squeue_set); + ip_squeue_set_destroy(cp); + cp->cpu_squeue_set = NULL; } break; default: @@ -1111,54 +756,3 @@ ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg) } return (0); } - -/* ARGSUSED */ -static void -ip_squeue_set_bind(squeue_set_t *sqs) -{ - int i; - squeue_t *sqp; - - if (!ip_squeue_bind) - return; - - mutex_enter(&sqs->sqs_lock); - for (i = 0; i < sqs->sqs_size; i++) { - sqp = sqs->sqs_list[i]; - if (sqp->sq_state & SQS_BOUND) - continue; - squeue_bind(sqp, -1); - } - mutex_exit(&sqs->sqs_lock); -} - -static void -ip_squeue_set_unbind(squeue_set_t *sqs) -{ - int i; - squeue_t *sqp; - - mutex_enter(&sqs->sqs_lock); - for (i = 0; i < sqs->sqs_size; i++) { - sqp = sqs->sqs_list[i]; - - /* - * CPU is going offline. Remove the thread affinity - * for any soft ring threads the squeue is managing. 
- */ - if (sqp->sq_state & SQS_ILL_BOUND) { - ill_rx_ring_t *ring = sqp->sq_rx_ring; - ill_t *ill = ring->rr_ill; - - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { - ASSERT(ring->rr_handle != NULL); - ill->ill_dls_capab->ill_dls_unbind( - ring->rr_handle); - } - } - if (!(sqp->sq_state & SQS_BOUND)) - continue; - squeue_unbind(sqp); - } - mutex_exit(&sqs->sqs_lock); -} diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index 7274576285..f785d8a3f6 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -176,13 +176,6 @@ int ipsec_weird_null_inbound_policy = 0; (((sa1)->ipsa_dst_cid == (sa2)->ipsa_dst_cid)))) /* - * IPv4 Fragments - */ -#define IS_V4_FRAGMENT(ipha_fragment_offset_and_flags) \ - (((ntohs(ipha_fragment_offset_and_flags) & IPH_OFFSET) != 0) || \ - ((ntohs(ipha_fragment_offset_and_flags) & IPH_MF) != 0)) - -/* * IPv6 Fragments */ #define IS_V6_FRAGMENT(ipp) (ipp.ipp_fields & IPPF_FRAGHDR) diff --git a/usr/src/uts/common/inet/ip/tun.c b/usr/src/uts/common/inet/ip/tun.c index 24af532b77..632601b5f1 100644 --- a/usr/src/uts/common/inet/ip/tun.c +++ b/usr/src/uts/common/inet/ip/tun.c @@ -3202,7 +3202,7 @@ tun_rdata_v4(queue_t *q, mblk_t *ipsec_mp, mblk_t *data_mp, tun_t *atp) */ pullup_len = hdrlen + (inner_v4 ? sizeof (ipha_t) : sizeof (ip6_t)) + 4; if ((data_mp->b_wptr - data_mp->b_rptr) < pullup_len) { - if (!pullupmsg(data_mp, hdrlen + pullup_len)) { + if (!pullupmsg(data_mp, pullup_len)) { atomic_add_32(&atp->tun_InErrors, 1); atomic_add_32(&atp->tun_InDiscard, 1); if (ipsec_mp != NULL) diff --git a/usr/src/uts/common/inet/ip_ftable.h b/usr/src/uts/common/inet/ip_ftable.h index e729761147..6a3a05183b 100644 --- a/usr/src/uts/common/inet/ip_ftable.h +++ b/usr/src/uts/common/inet/ip_ftable.h @@ -27,8 +27,6 @@ #ifndef _INET_IP_FTABLE_H #define _INET_IP_FTABLE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -94,6 +92,8 @@ extern void ire_delete_host_redirects(ipaddr_t, ip_stack_t *); extern ire_t *ire_ihandle_lookup_onlink(ire_t *); extern ire_t *ire_forward(ipaddr_t, enum ire_forward_action *, ire_t *, ire_t *, const struct ts_label_s *, ip_stack_t *); +extern ire_t *ire_forward_simple(ipaddr_t, enum ire_forward_action *, + ip_stack_t *); extern irb_t *ire_get_bucket(ire_t *); extern uint_t ifindex_lookup(const struct sockaddr *, zoneid_t); extern int ipfil_sendpkt(const struct sockaddr *, mblk_t *, uint_t, zoneid_t); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index 1bd5b47a9f..c0a6c51696 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -142,6 +142,12 @@ extern "C" { #define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */ #define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */ +#ifdef DEBUG +#define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill) +#else +#define ILL_MAC_PERIM_HELD(ill) +#endif + /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ @@ -158,6 +164,7 @@ extern void ill_dlpi_done(ill_t *, t_uscalar_t); extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); +extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *); @@ -208,9 +215,12 @@ extern void ill_untrace_ref(ill_t *); extern boolean_t ill_down_start(queue_t *, mblk_t *); 
extern ill_t *ill_lookup_group_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); + extern void ill_capability_ack(ill_t *, mblk_t *); extern void ill_capability_probe(ill_t *); -extern void ill_capability_reset(ill_t *); +extern void ill_capability_reset(ill_t *, boolean_t); +extern void ill_taskq_dispatch(ip_stack_t *); + extern void ill_mtu_change(ire_t *, char *); extern void ill_group_cleanup(ill_t *); extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); @@ -281,10 +291,11 @@ extern void ipsq_current_start(ipsq_t *, ipif_t *, int); extern void ipsq_current_finish(ipsq_t *); extern void ipsq_enq(ipsq_t *, queue_t *, mblk_t *, ipsq_func_t, int, ill_t *); -extern boolean_t ipsq_enter(ill_t *, boolean_t); +extern boolean_t ipsq_enter(ill_t *, boolean_t, int); extern ipsq_t *ipsq_try_enter(ipif_t *, ill_t *, queue_t *, mblk_t *, ipsq_func_t, int, boolean_t); extern void ipsq_exit(ipsq_t *); +extern boolean_t ill_mac_perim_held(ill_t *); extern mblk_t *ipsq_pending_mp_get(ipsq_t *, conn_t **); extern boolean_t ipsq_pending_mp_add(conn_t *, ipif_t *, queue_t *, mblk_t *, int); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index d993e5f6b4..f7a9b8ff58 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -40,6 +40,7 @@ extern "C" { #ifdef _KERNEL #include <sys/sdt.h> +#include <sys/dld.h> #define IP_MOD_ID 5701 @@ -359,7 +360,7 @@ typedef struct ip_mdt_info_s { ill->ill_mdt_capab->ill_mdt_on != 0) #define ILL_LSO_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_LSO) != 0) + (((ill)->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) /* * ioctl identifier and structure for Large Segment Offload @@ -378,12 +379,11 @@ typedef struct ip_lso_info_s { #define ILL_LSO_USABLE(ill) \ (ILL_LSO_CAPABLE(ill) && \ ill->ill_lso_capab != NULL && \ - ill->ill_lso_capab->ill_lso_version == LSO_VERSION_1 && \ ill->ill_lso_capab->ill_lso_on != 0) #define ILL_LSO_TCP_USABLE(ill) \ (ILL_LSO_USABLE(ill) && \ - ill->ill_lso_capab->ill_lso_flags & LSO_TX_BASIC_TCP_IPV4) + ill->ill_lso_capab->ill_lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4) /* * Macro that determines whether or not a given CONN may be considered @@ -497,43 +497,36 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; (connp)->conn_udp->udp_drain_qfull : \ !canputnext((connp)->conn_rq)) -#define ILL_DLS_CAPABLE(ill) \ - (((ill)->ill_capabilities & \ - (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0) - -/* - * Macro that hands off one or more messages directly to DLD - * when the interface is marked with ILL_CAPAB_POLL. 
- */
-#define IP_DLS_ILL_TX(ill, ipha, mp, ipst, hlen) { \
- ill_dls_capab_t *ill_dls = ill->ill_dls_capab; \
- ASSERT(ILL_DLS_CAPABLE(ill)); \
- ASSERT(ill_dls != NULL); \
- ASSERT(ill_dls->ill_tx != NULL); \
- ASSERT(ill_dls->ill_tx_handle != NULL); \
- DTRACE_PROBE4(ip4__physical__out__start, \
- ill_t *, NULL, ill_t *, ill, \
- ipha_t *, ipha, mblk_t *, mp); \
- FW_HOOKS(ipst->ips_ip4_physical_out_event, \
- ipst->ips_ipv4firewall_physical_out, \
- NULL, ill, ipha, mp, mp, 0, ipst); \
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); \
- if (mp != NULL) { \
- if (ipst->ips_ipobs_enabled) { \
- zoneid_t szone; \
- \
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp, \
- ipst, ALL_ZONES); \
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, \
- ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); \
- } \
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, \
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, \
- ipha_t *, ipha, ip6_t *, NULL, int, 0); \
- ill_dls->ill_tx(ill_dls->ill_tx_handle, mp); \
- } \
+/* Macro that follows definitions of flags for mac_tx() (see mac_client.h) */
+#define IP_DROP_ON_NO_DESC 0x01 /* Equivalent to MAC_DROP_ON_NO_DESC */
+
+#define ILL_DIRECT_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
+
+#define ILL_SEND_TX(ill, ire, hint, mp, flag) { \
+ if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \
+ ill_dld_direct_t *idd; \
+ \
+ idd = &(ill)->ill_dld_capab->idc_direct; \
+ /* \
+ * Send the packet directly to DLD, where it \
+ * may be queued depending on the availability \
+ * of transmit resources at the media layer. \
+ * Ignore the returned value for the time being. \
+ * In the future, we may want to take this into \
+ * account and flow-control the TCP. \
+ */ \
+ (void) idd->idd_tx_df(idd->idd_tx_dh, mp, \
+ (uintptr_t)(hint), flag); \
+ } else { \
+ putnext((ire)->ire_stq, mp); \
+ } \
 }

+#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
+ (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
+ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
+
 /*
 * In non-global zone exclusive IP stacks, data structures such as IRE
 * entries pretend that they're in the global zone.
The following @@ -548,6 +541,7 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; extern int ip_wput_frag_mdt_min; extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t); extern mblk_t *ip_prepend_zoneid(mblk_t *, zoneid_t, ip_stack_t *); +extern void ill_flow_enable(void *, ip_mac_tx_cookie_t); extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t); extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, ip_stack_t *, zoneid_t); diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index c9a0e12ea1..7accbbcfa3 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -235,6 +235,7 @@ extern void ire_atomic_end(irb_t *irb_ptr, ire_t *ire); extern void ire_cache_count(ire_t *, char *); extern ire_t *ire_cache_lookup(ipaddr_t, zoneid_t, const struct ts_label_s *, ip_stack_t *); +extern ire_t *ire_cache_lookup_simple(ipaddr_t, ip_stack_t *); extern ire_t *ire_cache_lookup_v6(const in6_addr_t *, zoneid_t, const struct ts_label_s *, ip_stack_t *); extern void ire_cache_reclaim(ire_t *, char *); diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index b788b95fa0..d0c3953374 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -35,7 +35,7 @@ extern "C" { #include <netinet/igmp_var.h> #ifdef _KERNEL - +#include <sys/list.h> /* * IP statistics. @@ -175,6 +175,13 @@ struct ip_stack { struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */ struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */ + /* Taskq dispatcher for capability operations */ + kmutex_t ips_capab_taskq_lock; + kcondvar_t ips_capab_taskq_cv; + list_t ips_capab_taskq_list; + kthread_t *ips_capab_taskq_thread; + boolean_t ips_capab_taskq_quit; + /* ipclassifier.c - keep in ip_stack_t */ /* ipclassifier hash tables */ struct connf_s *ips_rts_clients; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index dac6d023f7..4665549c69 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -26,8 +26,6 @@ #ifndef _INET_IPCLASSIFIER_H #define _INET_IPCLASSIFIER_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -222,10 +220,13 @@ struct conn_s { conn_recvslla : 1, /* IP_RECVSLLA option */ conn_mdt_ok : 1, /* MDT is permitted */ conn_nexthop_set : 1, - conn_allzones : 1, /* SO_ALLZONES */ + conn_allzones : 1; /* SO_ALLZONES */ + unsigned int conn_lso_ok : 1; /* LSO is usable */ + squeue_t *conn_initial_sqp; /* Squeue at open time */ + squeue_t *conn_final_sqp; /* Squeue after connect */ ill_t *conn_nofailover_ill; /* Failover ill */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ ipsec_latch_t *conn_latch; /* latched state */ @@ -286,8 +287,8 @@ struct conn_s { int conn_orig_bound_ifindex; /* BOUND_IF before MOVE */ int conn_orig_multicast_ifindex; /* IPv6 MC IF before MOVE */ - struct conn_s *conn_drain_next; /* Next conn in drain list */ - struct conn_s *conn_drain_prev; /* Prev conn in drain list */ + struct conn_s *conn_drain_next; /* Next conn in drain list */ + struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ mblk_t *conn_ipsec_opt_mp; /* ipsec option mblk */ uint32_t conn_src_preferences; /* prefs for src addr select */ @@ -499,6 +500,7 @@ struct connf_s { (connp)->conn_ports = ports; \ (connp)->conn_send = ip_output; \ (connp)->conn_sqp = 
IP_SQUEUE_GET(lbolt); \ + (connp)->conn_initial_sqp = (connp)->conn_sqp; \ } #define IPCL_TCP_EAGER_INIT_V6(connp, protocol, src, rem, ports) { \ @@ -508,6 +510,7 @@ struct connf_s { (connp)->conn_ports = ports; \ (connp)->conn_send = ip_output_v6; \ (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \ + (connp)->conn_initial_sqp = (connp)->conn_sqp; \ } #define IPCL_UDP_HASH(lport, ipst) \ diff --git a/usr/src/uts/common/inet/ipdrop.h b/usr/src/uts/common/inet/ipdrop.h index 88dcda264c..9fe672434e 100644 --- a/usr/src/uts/common/inet/ipdrop.h +++ b/usr/src/uts/common/inet/ipdrop.h @@ -124,7 +124,6 @@ struct ip_dropstats { }; #endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 4895e2249e..559abd9178 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -19,144 +19,95 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * Squeues - TCP/IP serialization mechanism. - * - * This is a general purpose high-performance serialization mechanism. It is - * similar to a taskq with a single worker thread, the difference is that it - * does not imply a context switch - the thread placing a request may actually - * process it. It is also biased for processing requests in interrupt context. - * - * Each squeue has a worker thread which may optionally be bound to a CPU. - * - * Only one thread may process requests from a given squeue at any time. This is - * called "entering" squeue. - * - * Each dispatched request is processed either by - * - * a) Dispatching thread or - * b) Some other thread that is currently processing squeue at the time of - * request or - * c) worker thread. - * - * INTERFACES: - * - * squeue_t *squeue_create(name, bind, wait, pri) - * - * name: symbolic name for squeue. - * wait: time to wait before waiking the worker thread after queueing - * request. - * bind: preferred CPU binding for the worker thread. - * pri: thread priority for the worker thread. - * - * This function never fails and may sleep. It returns a transparent pointer - * to the squeue_t structure that is passed to all other squeue operations. - * - * void squeue_bind(sqp, bind) - * - * Bind squeue worker thread to a CPU specified by the 'bind' argument. The - * 'bind' value of -1 binds to the preferred thread specified for - * squeue_create. - * - * NOTE: Any value of 'bind' other then -1 is not supported currently, but the - * API is present - in the future it may be useful to specify different - * binding. - * - * void squeue_unbind(sqp) - * - * Unbind the worker thread from its preferred CPU. - * - * void squeue_enter(*sqp, *mp, proc, arg, tag) - * - * Post a single request for processing. Each request consists of mblock 'mp', - * function 'proc' to execute and an argument 'arg' to pass to this - * function. The function is called as (*proc)(arg, mp, sqp); The tag is an - * arbitrary number from 0 to 255 which will be stored in mp to track exact - * caller of squeue_enter. The combination of function name and the tag should - * provide enough information to identify the caller. - * - * If no one is processing the squeue, squeue_enter() will call the function - * immediately. Otherwise it will add the request to the queue for later - * processing. 
Once the function is executed, the thread may continue
- * executing all other requests pending on the queue.
+ * Squeues: General purpose serialization mechanism
+ * ------------------------------------------------
 *
- * NOTE: The tagging information is only used when SQUEUE_DEBUG is set to 1.
- * NOTE: The argument can be conn_t only. Ideally we'd like to have generic
- * argument, but we want to drop connection reference count here - this
- * improves tail-call optimizations.
- * XXX: The arg should have type conn_t.
+ * Background:
+ * -----------
 *
- * void squeue_enter_nodrain(*sqp, *mp, proc, arg, tag)
+ * This is a general purpose high-performance serialization mechanism
+ * currently used by TCP/IP. It is implemented by means of a per-CPU queue,
+ * a worker thread and a polling thread which are bound to the CPU
+ * associated with the squeue. The squeue is strictly FIFO for both the read
+ * and write side and only one thread can process it at any given time.
+ * The design goal of the squeue was to offer a very high degree of
+ * parallelization (on a per H/W execution pipeline basis) with at
+ * most one queuing.
 *
- * Same as squeue_enter(), but the entering thread will only try to execute a
- * single request. It will not continue executing any pending requests.
+ * Modules needing protection typically call the squeue_enter() or
+ * squeue_enter_chain() routine as soon as a thread enters the module
+ * from either direction. For each packet, the processing function
+ * and argument are stored in the mblk itself. When the packet is ready
+ * to be processed, the squeue retrieves the stored function and calls
+ * it with the supplied argument and the pointer to the packet itself.
+ * The called function can assume that no other thread is processing
+ * the squeue when it is executing.
 *
- * void squeue_fill(*sqp, *mp, proc, arg, tag)
+ * Squeue/connection binding:
+ * --------------------------
 *
- * Just place the request on the queue without trying to execute it. Arrange
- * for the worker thread to process the request.
+ * TCP/IP uses an IP classifier in conjunction with squeues, where specific
+ * connections are assigned to a specific squeue (based on various policies)
+ * at connection creation time. Once assigned, the connection to
+ * squeue mapping is never changed and all future packets for that
+ * connection are processed on that squeue. The connection ("conn") to
+ * squeue mapping is stored in the "conn_t" member "conn_sqp".
 *
- * void squeue_profile_enable(sqp)
- * void squeue_profile_disable(sqp)
+ * Since the processing of the connection cuts across multiple layers
+ * but still allows packets for different connections to be processed on
+ * other CPUs/squeues, squeues are also termed "Vertical Perimeter" or
+ * "Per Connection Vertical Perimeter".
 *
- * Enable or disable profiling for specified 'sqp'. Profiling is only
- * available when SQUEUE_PROFILE is set.
+ * Processing Model:
+ * -----------------
 *
- * void squeue_profile_reset(sqp)
+ * The squeue doesn't necessarily process packets with its own worker thread.
+ * Callers can choose to just queue the packet, to process their packet
+ * if nothing is queued, or to drain and process. The first two
+ * modes are typically employed when the packet was generated while
+ * already doing the processing behind the squeue, and the last mode (drain
+ * and process) is typically employed when the thread is entering the squeue
+ * for the first time. The squeue still imposes a finite time limit
+ * for which an external thread can do processing, after which it switches
+ * processing to its own worker thread.
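+ *
+ * An abstract sketch of the queue-or-process entry (an illustrative
+ * sketch only; the exact squeue_enter() argument list is defined in
+ * squeue.h and is reworked by this change, with tcp_input/connp
+ * standing in for any proc/arg pair):
+ *
+ *	squeue_enter(connp->conn_sqp, mp, tcp_input, connp, tag);
+ *
+ * If no thread currently owns the squeue, the entering thread itself
+ * may run tcp_input() as (*proc)(arg, mp, sqp); otherwise the packet is
+ * queued and drained later, possibly by the worker thread.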
 *
- * Reset all profiling information to zero. Profiling is only
- * available when SQUEUE_PROFILE is set.
+ * Once created, squeues are never deleted. Hence squeue pointers are
+ * always valid. This means that functions outside the squeue can still
+ * refer safely to conn_sqp and there is no need for ref counts.
 *
- * void squeue_profile_start()
- * void squeue_profile_stop()
+ * Only a thread executing in the squeue can change the squeue of the
+ * connection. It does so by calling a squeue framework function to do this.
+ * After changing the squeue, the thread must leave the squeue. It must not
+ * continue to execute any code that needs squeue protection.
 *
- * Globally enable or disabled profiling for all squeues.
+ * The squeue framework, after entering the squeue, checks if the current
+ * squeue matches the conn_sqp. If the check fails, the packet is delivered
+ * to the right squeue.
 *
- * uintptr_t *squeue_getprivate(sqp, p)
+ * Polling Model:
+ * --------------
 *
- * Each squeue keeps small amount of private data space available for various
- * consumers. Current consumers include TCP and NCA. Other consumers need to
- * add their private tag to the sqprivate_t enum. The private information is
- * limited to an uintptr_t value. The squeue has no knowledge of its content
- * and does not manage it in any way.
+ * Squeues can control their rate of packet arrival from the NIC
+ * or a specific Rx ring within a NIC. As part of capability negotiation
+ * between the IP and MAC layers, squeues are created for each TCP soft ring
+ * (or TCP Rx ring - to be implemented in future). As part of this
+ * negotiation, squeues get a cookie for the underlying soft ring or Rx
+ * ring, a function to turn off incoming packets and a function to call
+ * to poll for packets. This helps schedule the receive side packet
+ * processing so that queue backlog doesn't build up and packet processing
+ * doesn't keep getting disturbed by high priority interrupts. As part
+ * of this mode, as soon as a backlog starts building, the squeue turns off
+ * the interrupts and switches to poll mode. In poll mode, when the poll
+ * thread goes down to retrieve packets, it retrieves them in the form of
+ * a chain, which improves performance even more. As the squeue/softring
+ * system gets more packets, it gets more efficient by switching to
+ * polling more often and dealing with larger packet chains.
+ *
- * The typical use may be a breakdown of data structures per CPU (since
- * squeues are usually per CPU). See NCA for examples of use.
- * Currently 'p' may have one legal value SQPRIVATE_TCP.
- *
- * processorid_t squeue_binding(sqp)
- *
- * Returns the CPU binding for a given squeue.
- *
- * TUNABALES:
- *
- * squeue_intrdrain_ms: Maximum time in ms interrupts spend draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 20 ms.
- *
- * squeue_writerdrain_ms: Maximum time in ms non-interrupts spend draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 10 ms.
- * - * squeue_workerdrain_ms: Maximum time in ms worker thread spends draining any - * squeue. Note that this is approximation - squeues have no control on the - * time it takes to process each request. This limit is only checked - * between processing individual messages. - * Default: 10 ms. - * - * squeue_workerwait_ms: When worker thread is interrupted because workerdrain - * expired, how much time to wait before waking worker thread again. - * Default: 10 ms. */ #include <sys/types.h> @@ -169,208 +120,30 @@ #include <sys/callb.h> #include <sys/sdt.h> #include <sys/ddi.h> +#include <sys/sunddi.h> #include <inet/ipclassifier.h> #include <inet/udp_impl.h> -/* - * State flags. - * Note: The MDB IP module depends on the values of these flags. - */ -#define SQS_PROC 0x0001 /* being processed */ -#define SQS_WORKER 0x0002 /* worker thread */ -#define SQS_ENTER 0x0004 /* enter thread */ -#define SQS_FAST 0x0008 /* enter-fast thread */ -#define SQS_USER 0x0010 /* A non interrupt user */ -#define SQS_BOUND 0x0020 /* Worker thread is bound */ -#define SQS_PROFILE 0x0040 /* Enable profiling */ -#define SQS_REENTER 0x0080 /* Re entered thread */ -#define SQS_TMO_PROG 0x0100 /* Timeout is being set */ - #include <sys/squeue_impl.h> static void squeue_fire(void *); static void squeue_drain(squeue_t *, uint_t, hrtime_t); static void squeue_worker(squeue_t *sqp); - -#if SQUEUE_PROFILE -static kmutex_t squeue_kstat_lock; -static int squeue_kstat_update(kstat_t *, int); -#endif +static void squeue_polling_thread(squeue_t *sqp); kmem_cache_t *squeue_cache; #define SQUEUE_MSEC_TO_NSEC 1000000 -int squeue_intrdrain_ms = 20; -int squeue_writerdrain_ms = 10; -int squeue_workerdrain_ms = 10; -int squeue_workerwait_ms = 10; +int squeue_drain_ms = 20; +int squeue_workerwait_ms = 0; /* The values above converted to ticks or nano seconds */ -static int squeue_intrdrain_ns = 0; -static int squeue_writerdrain_ns = 0; -static int squeue_workerdrain_ns = 0; +static int squeue_drain_ns = 0; static int squeue_workerwait_tick = 0; -/* - * The minimum packet queued when worker thread doing the drain triggers - * polling (if squeue allows it). The choice of 3 is arbitrary. You - * definitely don't want it to be 1 since that will trigger polling - * on very low loads as well (ssh seems to do be one such example - * where packet flow was very low yet somehow 1 packet ended up getting - * queued and worker thread fires every 10ms and blanking also gets - * triggered. - */ -int squeue_worker_poll_min = 3; - -#if SQUEUE_PROFILE -/* - * Set to B_TRUE to enable profiling. 
- */ -static int squeue_profile = B_FALSE; -#define SQ_PROFILING(sqp) (squeue_profile && ((sqp)->sq_state & SQS_PROFILE)) - -#define SQSTAT(sqp, x) ((sqp)->sq_stats.x++) -#define SQDELTA(sqp, x, d) ((sqp)->sq_stats.x += (d)) - -struct squeue_kstat { - kstat_named_t sq_count; - kstat_named_t sq_max_qlen; - kstat_named_t sq_npackets_worker; - kstat_named_t sq_npackets_intr; - kstat_named_t sq_npackets_other; - kstat_named_t sq_nqueued_intr; - kstat_named_t sq_nqueued_other; - kstat_named_t sq_ndrains_worker; - kstat_named_t sq_ndrains_intr; - kstat_named_t sq_ndrains_other; - kstat_named_t sq_time_worker; - kstat_named_t sq_time_intr; - kstat_named_t sq_time_other; -} squeue_kstat = { - { "count", KSTAT_DATA_UINT64 }, - { "max_qlen", KSTAT_DATA_UINT64 }, - { "packets_worker", KSTAT_DATA_UINT64 }, - { "packets_intr", KSTAT_DATA_UINT64 }, - { "packets_other", KSTAT_DATA_UINT64 }, - { "queued_intr", KSTAT_DATA_UINT64 }, - { "queued_other", KSTAT_DATA_UINT64 }, - { "ndrains_worker", KSTAT_DATA_UINT64 }, - { "ndrains_intr", KSTAT_DATA_UINT64 }, - { "ndrains_other", KSTAT_DATA_UINT64 }, - { "time_worker", KSTAT_DATA_UINT64 }, - { "time_intr", KSTAT_DATA_UINT64 }, - { "time_other", KSTAT_DATA_UINT64 }, -}; -#endif - -#define SQUEUE_WORKER_WAKEUP(sqp) { \ - timeout_id_t tid = (sqp)->sq_tid; \ - \ - ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - /* \ - * Queue isn't being processed, so take \ - * any post enqueue actions needed before leaving. \ - */ \ - if (tid != 0) { \ - /* \ - * Waiting for an enter() to process mblk(s). \ - */ \ - clock_t waited = lbolt - (sqp)->sq_awaken; \ - \ - if (TICK_TO_MSEC(waited) >= (sqp)->sq_wait) { \ - /* \ - * Times up and have a worker thread \ - * waiting for work, so schedule it. \ - */ \ - (sqp)->sq_tid = 0; \ - (sqp)->sq_awaken = lbolt; \ - cv_signal(&(sqp)->sq_async); \ - mutex_exit(&(sqp)->sq_lock); \ - (void) untimeout(tid); \ - return; \ - } \ - mutex_exit(&(sqp)->sq_lock); \ - return; \ - } else if ((sqp)->sq_state & SQS_TMO_PROG) { \ - mutex_exit(&(sqp)->sq_lock); \ - return; \ - } else if ((sqp)->sq_wait != 0) { \ - clock_t wait = (sqp)->sq_wait; \ - /* \ - * Wait up to sqp->sq_wait ms for an \ - * enter() to process this queue. We \ - * don't want to contend on timeout locks \ - * with sq_lock held for performance reasons, \ - * so drop the sq_lock before calling timeout \ - * but we need to check if timeout is required \ - * after re acquiring the sq_lock. Once \ - * the sq_lock is dropped, someone else could \ - * have processed the packet or the timeout could \ - * have already fired. \ - */ \ - (sqp)->sq_state |= SQS_TMO_PROG; \ - mutex_exit(&(sqp)->sq_lock); \ - tid = timeout(squeue_fire, (sqp), wait); \ - mutex_enter(&(sqp)->sq_lock); \ - /* Check again if we still need the timeout */ \ - if ((((sqp)->sq_state & (SQS_PROC|SQS_TMO_PROG)) == \ - SQS_TMO_PROG) && ((sqp)->sq_tid == 0) && \ - ((sqp)->sq_first != NULL)) { \ - (sqp)->sq_state &= ~SQS_TMO_PROG; \ - (sqp)->sq_awaken = lbolt; \ - (sqp)->sq_tid = tid; \ - mutex_exit(&(sqp)->sq_lock); \ - return; \ - } else { \ - if ((sqp)->sq_state & SQS_TMO_PROG) { \ - (sqp)->sq_state &= ~SQS_TMO_PROG; \ - mutex_exit(&(sqp)->sq_lock); \ - (void) untimeout(tid); \ - } else { \ - /* \ - * The timer fired before we could \ - * reacquire the sq_lock. squeue_fire \ - * removes the SQS_TMO_PROG flag \ - * and we don't need to do anything \ - * else. \ - */ \ - mutex_exit(&(sqp)->sq_lock); \ - } \ - } \ - } else { \ - /* \ - * Schedule the worker thread. 
\ - */ \ - (sqp)->sq_awaken = lbolt; \ - cv_signal(&(sqp)->sq_async); \ - mutex_exit(&(sqp)->sq_lock); \ - } \ - ASSERT(MUTEX_NOT_HELD(&(sqp)->sq_lock)); \ -} - -#define ENQUEUE_MP(sqp, mp, proc, arg) { \ - /* \ - * Enque our mblk. \ - */ \ - (mp)->b_queue = NULL; \ - ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - ASSERT((mp)->b_prev == NULL && (mp)->b_next == NULL); \ - (mp)->b_queue = (queue_t *)(proc); \ - (mp)->b_prev = (mblk_t *)(arg); \ - \ - if ((sqp)->sq_last != NULL) \ - (sqp)->sq_last->b_next = (mp); \ - else \ - (sqp)->sq_first = (mp); \ - (sqp)->sq_last = (mp); \ - (sqp)->sq_count++; \ - ASSERT((sqp)->sq_count > 0); \ - DTRACE_PROBE2(squeue__enqueue, squeue_t *, sqp, \ - mblk_t *, mp); \ -} - +#define MAX_BYTES_TO_PICKUP 150000 #define ENQUEUE_CHAIN(sqp, mp, tail, cnt) { \ /* \ @@ -390,89 +163,120 @@ struct squeue_kstat { \ } -#define SQS_POLLING_ON(sqp, rx_ring) { \ - ASSERT(rx_ring != NULL); \ +#define SQS_POLLING_ON(sqp, sq_poll_capable, rx_ring) { \ ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - rx_ring->rr_blank(rx_ring->rr_handle, \ - MIN((sqp->sq_avg_drain_time * sqp->sq_count), \ - rx_ring->rr_max_blank_time), \ - rx_ring->rr_max_pkt_cnt); \ - rx_ring->rr_poll_state |= ILL_POLLING; \ - rx_ring->rr_poll_time = lbolt; \ + if (sq_poll_capable) { \ + ASSERT(rx_ring != NULL); \ + ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \ + if (!(sqp->sq_state & SQS_POLLING)) { \ + sqp->sq_state |= SQS_POLLING; \ + rx_ring->rr_intr_disable(rx_ring->rr_intr_handle); \ + } \ + } \ } +#define SQS_POLLING_OFF(sqp, sq_poll_capable, rx_ring) { \ + ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ + if (sq_poll_capable) { \ + ASSERT(rx_ring != NULL); \ + ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \ + if (sqp->sq_state & SQS_POLLING) { \ + sqp->sq_state &= ~SQS_POLLING; \ + rx_ring->rr_intr_enable(rx_ring->rr_intr_handle); \ + } \ + } \ +} -#define SQS_POLLING_OFF(sqp, rx_ring) { \ - ASSERT(rx_ring != NULL); \ +#define SQS_POLL_RING(sqp, sq_poll_capable) { \ ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - rx_ring->rr_blank(rx_ring->rr_handle, \ - rx_ring->rr_min_blank_time, \ - rx_ring->rr_min_pkt_cnt); \ + if (sq_poll_capable) { \ + ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \ + if (!(sqp->sq_state & SQS_GET_PKTS)) { \ + sqp->sq_state |= SQS_GET_PKTS; \ + cv_signal(&sqp->sq_poll_cv); \ + } \ + } \ } +#ifdef DEBUG +#define SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) { \ + (sqp)->sq_curmp = (mp); \ + (sqp)->sq_curproc = (proc); \ + (sqp)->sq_connp = (connp); \ + (mp)->b_tag = (sqp)->sq_tag = (tag); \ +} + +#define SQUEUE_DBG_CLEAR(sqp) { \ + (sqp)->sq_curmp = NULL; \ + (sqp)->sq_curproc = NULL; \ + (sqp)->sq_connp = NULL; \ +} +#else +#define SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) +#define SQUEUE_DBG_CLEAR(sqp) +#endif + void squeue_init(void) { squeue_cache = kmem_cache_create("squeue_cache", sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0); - squeue_intrdrain_ns = squeue_intrdrain_ms * SQUEUE_MSEC_TO_NSEC; - squeue_writerdrain_ns = squeue_writerdrain_ms * SQUEUE_MSEC_TO_NSEC; - squeue_workerdrain_ns = squeue_workerdrain_ms * SQUEUE_MSEC_TO_NSEC; + squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC; squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms); } /* ARGSUSED */ squeue_t * -squeue_create(char *name, processorid_t bind, clock_t wait, pri_t pri) +squeue_create(clock_t wait, pri_t pri) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); bzero(sqp, sizeof (squeue_t)); - (void) strncpy(sqp->sq_name, name, SQ_NAMELEN + 1); - sqp->sq_name[SQ_NAMELEN] = '\0'; - - sqp->sq_bind = bind; + sqp->sq_bind = 
PBIND_NONE; + sqp->sq_priority = pri; sqp->sq_wait = MSEC_TO_TICK(wait); - sqp->sq_avg_drain_time = - drv_hztousec(NSEC_TO_TICK_ROUNDUP(squeue_intrdrain_ns)) / - NSEC_TO_TICK_ROUNDUP(squeue_intrdrain_ns); - -#if SQUEUE_PROFILE - if ((sqp->sq_kstat = kstat_create("ip", bind, name, - "net", KSTAT_TYPE_NAMED, - sizeof (squeue_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) != NULL) { - sqp->sq_kstat->ks_lock = &squeue_kstat_lock; - sqp->sq_kstat->ks_data = &squeue_kstat; - sqp->sq_kstat->ks_update = squeue_kstat_update; - sqp->sq_kstat->ks_private = sqp; - kstat_install(sqp->sq_kstat); - } -#endif - sqp->sq_worker = thread_create(NULL, 0, squeue_worker, sqp, 0, &p0, TS_RUN, pri); + sqp->sq_poll_thr = thread_create(NULL, 0, squeue_polling_thread, + sqp, 0, &p0, TS_RUN, pri); + + sqp->sq_enter = squeue_enter; + sqp->sq_drain = squeue_drain; + return (sqp); } -/* ARGSUSED */ +/* + * Bind squeue worker thread to the specified CPU, given by CPU id. + * If the CPU id value is -1, bind the worker thread to the value + * specified in sq_bind field. If a thread is already bound to a + * different CPU, unbind it from the old CPU and bind to the new one. + */ + void squeue_bind(squeue_t *sqp, processorid_t bind) { - ASSERT(bind == -1); - mutex_enter(&sqp->sq_lock); + ASSERT(sqp->sq_bind != PBIND_NONE || bind != PBIND_NONE); + ASSERT(MUTEX_HELD(&cpu_lock)); + if (sqp->sq_state & SQS_BOUND) { - mutex_exit(&sqp->sq_lock); - return; + if (sqp->sq_bind == bind) { + mutex_exit(&sqp->sq_lock); + return; + } + thread_affinity_clear(sqp->sq_worker); + } else { + sqp->sq_state |= SQS_BOUND; } - sqp->sq_state |= SQS_BOUND; - mutex_exit(&sqp->sq_lock); + if (bind != PBIND_NONE) + sqp->sq_bind = bind; thread_affinity_set(sqp->sq_worker, sqp->sq_bind); + mutex_exit(&sqp->sq_lock); } void @@ -485,9 +289,98 @@ squeue_unbind(squeue_t *sqp) } sqp->sq_state &= ~SQS_BOUND; + thread_affinity_clear(sqp->sq_worker); mutex_exit(&sqp->sq_lock); +} - thread_affinity_clear(sqp->sq_worker); +void +squeue_worker_wakeup(squeue_t *sqp) +{ + timeout_id_t tid = (sqp)->sq_tid; + + ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); + + if (sqp->sq_wait == 0) { + ASSERT(tid == 0); + ASSERT(!(sqp->sq_state & SQS_TMO_PROG)); + sqp->sq_awaken = lbolt; + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + return; + } + + /* + * Queue isn't being processed, so take + * any post enqueue actions needed before leaving. + */ + if (tid != 0) { + /* + * Waiting for an enter() to process mblk(s). + */ + clock_t waited = lbolt - sqp->sq_awaken; + + if (TICK_TO_MSEC(waited) >= sqp->sq_wait) { + /* + * Times up and have a worker thread + * waiting for work, so schedule it. + */ + sqp->sq_tid = 0; + sqp->sq_awaken = lbolt; + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + (void) untimeout(tid); + return; + } + mutex_exit(&sqp->sq_lock); + return; + } else if (sqp->sq_state & SQS_TMO_PROG) { + mutex_exit(&sqp->sq_lock); + return; + } else { + clock_t wait = sqp->sq_wait; + /* + * Wait up to sqp->sq_wait ms for an + * enter() to process this queue. We + * don't want to contend on timeout locks + * with sq_lock held for performance reasons, + * so drop the sq_lock before calling timeout + * but we need to check if timeout is required + * after re acquiring the sq_lock. Once + * the sq_lock is dropped, someone else could + * have processed the packet or the timeout could + * have already fired. 
+ */ + sqp->sq_state |= SQS_TMO_PROG; + mutex_exit(&sqp->sq_lock); + tid = timeout(squeue_fire, sqp, wait); + mutex_enter(&sqp->sq_lock); + /* Check again if we still need the timeout */ + if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) == + SQS_TMO_PROG) && (sqp->sq_tid == 0) && + (sqp->sq_first != NULL)) { + sqp->sq_state &= ~SQS_TMO_PROG; + sqp->sq_tid = tid; + mutex_exit(&sqp->sq_lock); + return; + } else { + if (sqp->sq_state & SQS_TMO_PROG) { + sqp->sq_state &= ~SQS_TMO_PROG; + mutex_exit(&sqp->sq_lock); + (void) untimeout(tid); + } else { + /* + * The timer fired before we could + * reacquire the sq_lock. squeue_fire + * removes the SQS_TMO_PROG flag + * and we don't need to do anything + * else. + */ + mutex_exit(&sqp->sq_lock); + } + } + } + + ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); } /* @@ -500,18 +393,20 @@ squeue_unbind(squeue_t *sqp) * * The proc and arg for each mblk is already stored in the mblk in * appropriate places. + * + * The process_flag specifies if we are allowed to process the mblk + * and drain in the entering thread context. If process_flag is + * SQ_FILL, then we just queue the mblk and return (after signaling + * the worker thread if no one else is processing the squeue). */ +/* ARGSUSED */ void -squeue_enter_chain(squeue_t *sqp, mblk_t *mp, mblk_t *tail, - uint32_t cnt, uint8_t tag) +squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, + int process_flag, uint8_t tag) { - int interrupt = servicing_interrupt(); - void *arg; + conn_t *connp; sqproc_t proc; hrtime_t now; -#if SQUEUE_PROFILE - hrtime_t start, delta; -#endif ASSERT(sqp != NULL); ASSERT(mp != NULL); @@ -520,355 +415,111 @@ squeue_enter_chain(squeue_t *sqp, mblk_t *mp, mblk_t *tail, ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); - if (!(sqp->sq_state & SQS_PROC)) { + + /* + * Try to process the packet if SQ_FILL flag is not set and + * we are allowed to process the squeue. The SQ_NODRAIN is + * ignored if the packet chain consists of more than 1 packet. + */ + if (!(sqp->sq_state & SQS_PROC) && ((process_flag == SQ_PROCESS) || + (process_flag == SQ_NODRAIN && sqp->sq_first == NULL))) { /* * See if anything is already queued. If we are the * first packet, do inline processing else queue the * packet and do the drain. */ - sqp->sq_run = curthread; if (sqp->sq_first == NULL && cnt == 1) { /* * Fast-path, ok to process and nothing queued. */ sqp->sq_state |= (SQS_PROC|SQS_FAST); + sqp->sq_run = curthread; mutex_exit(&sqp->sq_lock); /* * We are the chain of 1 packet so * go through this fast path. 
*/ - arg = mp->b_prev; + ASSERT(mp->b_prev != NULL); + ASSERT(mp->b_queue != NULL); + connp = (conn_t *)mp->b_prev; mp->b_prev = NULL; proc = (sqproc_t)mp->b_queue; mp->b_queue = NULL; - - ASSERT(proc != NULL); - ASSERT(arg != NULL); + ASSERT(proc != NULL && connp != NULL); ASSERT(mp->b_next == NULL); -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = arg; - mp->b_tag = sqp->sq_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else - SQSTAT(sqp, sq_npackets_other); - start = gethrtime(); - } -#endif - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; - sqp->sq_isintr = 0; -#endif - - CONN_DEC_REF((conn_t *)arg); - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~(SQS_PROC|SQS_FAST); - if (sqp->sq_first == NULL) { - /* - * We processed inline our packet and - * nothing new has arrived. We are done. - */ - sqp->sq_run = NULL; - mutex_exit(&sqp->sq_lock); - return; - } else if (sqp->sq_bind != CPU->cpu_id) { - /* - * If the current thread is not running - * on the CPU to which this squeue is bound, - * then don't allow it to drain. - */ - sqp->sq_run = NULL; - SQUEUE_WORKER_WAKEUP(sqp); - return; - } - } else { - ENQUEUE_CHAIN(sqp, mp, tail, cnt); -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = - sqp->sq_count; - } -#endif - } - - /* - * We are here because either we couldn't do inline - * processing (because something was already queued), - * or we had a chanin of more than one packet, - * or something else arrived after we were done with - * inline processing. - */ - ASSERT(MUTEX_HELD(&sqp->sq_lock)); - ASSERT(sqp->sq_first != NULL); - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - start = gethrtime(); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; -#endif - - now = gethrtime(); - if (interrupt) { - squeue_drain(sqp, SQS_ENTER, now + - squeue_intrdrain_ns); - } else { - squeue_drain(sqp, SQS_USER, now + - squeue_writerdrain_ns); - } - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = 0; -#endif - - /* - * If we didn't do a complete drain, the worker - * thread was already signalled by squeue_drain. - */ - sqp->sq_run = NULL; - mutex_exit(&sqp->sq_lock); - return; - } else { - ASSERT(sqp->sq_run != NULL); - /* - * Queue is already being processed. Just enqueue - * the packet and go away. 
- */ -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - - ENQUEUE_CHAIN(sqp, mp, tail, cnt); - mutex_exit(&sqp->sq_lock); - return; - } -} - -/* - * squeue_enter() - enter squeue *sqp with mblk *mp with argument of *arg. - */ -void -squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, - uint8_t tag) -{ - int interrupt = servicing_interrupt(); - hrtime_t now; -#if SQUEUE_PROFILE - hrtime_t start, delta; -#endif -#if SQUEUE_DEBUG - conn_t *connp = (conn_t *)arg; - ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); - ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); -#endif - - ASSERT(proc != NULL); - ASSERT(sqp != NULL); - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - - mutex_enter(&sqp->sq_lock); - if (!(sqp->sq_state & SQS_PROC)) { - /* - * See if anything is already queued. If we are the - * first packet, do inline processing else queue the - * packet and do the drain. - */ - sqp->sq_run = curthread; - if (sqp->sq_first == NULL) { /* - * Fast-path, ok to process and nothing queued. + * Handle squeue switching. More details in the + * block comment at the top of the file */ - sqp->sq_state |= (SQS_PROC|SQS_FAST); - mutex_exit(&sqp->sq_lock); - -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = connp; - mp->b_tag = sqp->sq_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else - SQSTAT(sqp, sq_npackets_other); - start = gethrtime(); + if (connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, + sqp, mblk_t *, mp, conn_t *, connp); + (*proc)(connp, mp, sqp); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, + sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + SQUEUE_DBG_CLEAR(sqp); + CONN_DEC_REF(connp); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, + connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); } -#endif - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; - sqp->sq_isintr = 0; -#endif - - CONN_DEC_REF((conn_t *)arg); ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~(SQS_PROC|SQS_FAST); - if (sqp->sq_first == NULL) { + sqp->sq_run = NULL; + if (sqp->sq_first == NULL || + process_flag == SQ_NODRAIN) { + if (sqp->sq_first != NULL) { + squeue_worker_wakeup(sqp); + return; + } /* - * We processed inline our packet and - * nothing new has arrived. We are done. + * We processed inline our packet and nothing + * new has arrived. We are done. In case any + * control actions are pending, wake up the + * worker. 
*/ - sqp->sq_run = NULL; + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) + cv_signal(&sqp->sq_worker_cv); mutex_exit(&sqp->sq_lock); return; - } else if (sqp->sq_bind != CPU->cpu_id) { - /* - * If the current thread is not running - * on the CPU to which this squeue is bound, - * then don't allow it to drain. - */ - sqp->sq_run = NULL; - SQUEUE_WORKER_WAKEUP(sqp); - return; } } else { - ENQUEUE_MP(sqp, mp, proc, arg); -#if SQUEUE_DEBUG + ENQUEUE_CHAIN(sqp, mp, tail, cnt); +#ifdef DEBUG mp->b_tag = tag; #endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = - sqp->sq_count; - } -#endif } - /* * We are here because either we couldn't do inline - * processing (because something was already queued) + * processing (because something was already queued), + * or we had a chain of more than one packet, * or something else arrived after we were done with * inline processing. */ ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT(sqp->sq_first != NULL); - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - start = gethrtime(); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; -#endif - now = gethrtime(); - if (interrupt) { - squeue_drain(sqp, SQS_ENTER, now + - squeue_intrdrain_ns); - } else { - squeue_drain(sqp, SQS_USER, now + - squeue_writerdrain_ns); - } - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = 0; -#endif + sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns); /* * If we didn't do a complete drain, the worker * thread was already signalled by squeue_drain. + * In case any control actions are pending, wake + * up the worker. */ sqp->sq_run = NULL; + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) + cv_signal(&sqp->sq_worker_cv); mutex_exit(&sqp->sq_lock); return; } else { - ASSERT(sqp->sq_run != NULL); /* * We let a thread processing a squeue reenter only * once. This helps the case of incoming connection @@ -878,168 +529,42 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, * loopback connection where the two ends are bound * to the same squeue (which is typical on single * CPU machines). + * * We let the thread reenter only once for the fear * of stack getting blown with multiple traversal. */ + connp = (conn_t *)mp->b_prev; if (!(sqp->sq_state & SQS_REENTER) && - (sqp->sq_run == curthread) && sqp->sq_first == NULL && - (((conn_t *)arg)->conn_on_sqp == B_FALSE)) { + (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && + (sqp->sq_run == curthread) && (cnt == 1) && + (connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - CONN_DEC_REF((conn_t *)arg); - - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~SQS_REENTER; - mutex_exit(&sqp->sq_lock); - return; - } - /* - * Queue is already being processed. Just enqueue - * the packet and go away. 
- */ -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - - ENQUEUE_MP(sqp, mp, proc, arg); - mutex_exit(&sqp->sq_lock); - return; - } -} - -void -squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, - uint8_t tag) -{ - int interrupt = servicing_interrupt(); - boolean_t being_processed; -#if SQUEUE_DEBUG - conn_t *connp = (conn_t *)arg; -#endif -#if SQUEUE_PROFILE - hrtime_t start, delta; -#endif + ASSERT(mp->b_prev != NULL); + ASSERT(mp->b_queue != NULL); - ASSERT(proc != NULL); - ASSERT(sqp != NULL); - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); - ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - - mutex_enter(&sqp->sq_lock); - - being_processed = (sqp->sq_state & SQS_PROC); - if (!being_processed && (sqp->sq_first == NULL)) { - /* - * Fast-path, ok to process and nothing queued. - */ - sqp->sq_state |= (SQS_PROC|SQS_FAST); - sqp->sq_run = curthread; - mutex_exit(&sqp->sq_lock); - -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = connp; - mp->b_tag = sqp->sq_tag = tag; -#endif - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else - SQSTAT(sqp, sq_npackets_other); - start = gethrtime(); - } -#endif - - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; - sqp->sq_isintr = 0; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif + mp->b_prev = NULL; + proc = (sqproc_t)mp->b_queue; + mp->b_queue = NULL; - CONN_DEC_REF((conn_t *)arg); - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~(SQS_PROC|SQS_FAST); - sqp->sq_run = NULL; - if (sqp->sq_first == NULL) { /* - * We processed inline our packet and - * nothing new has arrived. We are done. + * Handle squeue switching. More details in the + * block comment at the top of the file */ - mutex_exit(&sqp->sq_lock); - } else { - SQUEUE_WORKER_WAKEUP(sqp); - } - return; - } else { - /* - * We let a thread processing a squeue reenter only - * once. This helps the case of incoming connection - * where a SYN-ACK-ACK that triggers the conn_ind - * doesn't have to queue the packet if listener and - * eager are on the same squeue. Also helps the - * loopback connection where the two ends are bound - * to the same squeue (which is typical on single - * CPU machines). - * We let the thread reenter only once for the fear - * of stack getting blown with multiple traversal. 
- */ - if (being_processed && !(sqp->sq_state & SQS_REENTER) && - (sqp->sq_run == curthread) && sqp->sq_first == NULL && - (((conn_t *)arg)->conn_on_sqp == B_FALSE)) { - sqp->sq_state |= SQS_REENTER; - mutex_exit(&sqp->sq_lock); - - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - CONN_DEC_REF((conn_t *)arg); + if (connp->conn_sqp == sqp) { + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, + sqp, mblk_t *, mp, conn_t *, connp); + (*proc)(connp, mp, sqp); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, + sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, + connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); + } mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~SQS_REENTER; @@ -1047,80 +572,32 @@ squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, return; } -#if SQUEUE_DEBUG + /* + * Queue is already being processed or there is already + * one or more paquets on the queue. Enqueue the + * packet and wakeup the squeue worker thread if the + * squeue is not being processed. + */ +#ifdef DEBUG mp->b_tag = tag; #endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - ENQUEUE_MP(sqp, mp, proc, arg); - if (being_processed) { - /* - * Queue is already being processed. - * No need to do anything. - */ - mutex_exit(&sqp->sq_lock); + + ENQUEUE_CHAIN(sqp, mp, tail, cnt); + if (!(sqp->sq_state & SQS_PROC)) { + squeue_worker_wakeup(sqp); return; } - SQUEUE_WORKER_WAKEUP(sqp); - } -} - -/* - * squeue_fill() - fill squeue *sqp with mblk *mp with argument of *arg - * without processing the squeue. - */ -/* ARGSUSED */ -void -squeue_fill(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void * arg, - uint8_t tag) -{ -#if SQUEUE_DEBUG - conn_t *connp = (conn_t *)arg; -#endif - ASSERT(proc != NULL); - ASSERT(sqp != NULL); - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); - ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); - - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - mutex_enter(&sqp->sq_lock); - ENQUEUE_MP(sqp, mp, proc, arg); -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - - /* - * If queue is already being processed. No need to do anything. - */ - if (sqp->sq_state & SQS_PROC) { + /* + * In case any control actions are pending, wake + * up the worker. 
+ */ + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) + cv_signal(&sqp->sq_worker_cv); mutex_exit(&sqp->sq_lock); return; } - - SQUEUE_WORKER_WAKEUP(sqp); } - /* * PRIVATE FUNCTIONS */ @@ -1151,7 +628,7 @@ squeue_fire(void *arg) if (!(state & SQS_PROC)) { sqp->sq_awaken = lbolt; - cv_signal(&sqp->sq_async); + cv_signal(&sqp->sq_worker_cv); } mutex_exit(&sqp->sq_lock); } @@ -1159,64 +636,52 @@ squeue_fire(void *arg) static void squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) { - mblk_t *mp; - mblk_t *head; - sqproc_t proc; - conn_t *connp; - clock_t start = lbolt; - clock_t drain_time; - timeout_id_t tid; - uint_t cnt; - uint_t total_cnt = 0; + mblk_t *mp; + mblk_t *head; + sqproc_t proc; + conn_t *connp; + timeout_id_t tid; ill_rx_ring_t *sq_rx_ring = sqp->sq_rx_ring; - int interrupt = servicing_interrupt(); - boolean_t poll_on = B_FALSE; - hrtime_t now; + hrtime_t now; + boolean_t did_wakeup = B_FALSE; + boolean_t sq_poll_capable; + sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; +again: ASSERT(mutex_owned(&sqp->sq_lock)); - ASSERT(!(sqp->sq_state & SQS_PROC)); - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_ndrains_intr); - else if (!(proc_type & SQS_WORKER)) - SQSTAT(sqp, sq_ndrains_other); - else - SQSTAT(sqp, sq_ndrains_worker); - } -#endif + ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE))); + + head = sqp->sq_first; + sqp->sq_first = NULL; + sqp->sq_last = NULL; + sqp->sq_count = 0; if ((tid = sqp->sq_tid) != 0) sqp->sq_tid = 0; sqp->sq_state |= SQS_PROC | proc_type; - head = sqp->sq_first; - sqp->sq_first = NULL; - sqp->sq_last = NULL; - cnt = sqp->sq_count; + /* * We have backlog built up. Switch to polling mode if the - * device underneath allows it. Need to do it only for - * drain by non-interrupt thread so interrupts don't - * come and disrupt us in between. If its a interrupt thread, - * no need because most devices will not issue another - * interrupt till this one returns. + * device underneath allows it. Need to do it so that + * more packets don't come in and disturb us (by contending + * for sq_lock or higher priority thread preempting us). + * + * The worker thread is allowed to do active polling while we + * just disable the interrupts for drain by non worker (kernel + * or userland) threads so they can peacefully process the + * packets during time allocated to them. */ - if ((sqp->sq_state & SQS_POLL_CAPAB) && !(proc_type & SQS_ENTER) && - (sqp->sq_count > squeue_worker_poll_min)) { - ASSERT(sq_rx_ring != NULL); - SQS_POLLING_ON(sqp, sq_rx_ring); - poll_on = B_TRUE; - } - + SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring); mutex_exit(&sqp->sq_lock); if (tid != 0) (void) untimeout(tid); -again: + while ((mp = head) != NULL) { + head = mp->b_next; mp->b_next = NULL; @@ -1224,255 +689,548 @@ again: mp->b_queue = NULL; connp = (conn_t *)mp->b_prev; mp->b_prev = NULL; -#if SQUEUE_DEBUG - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = connp; - sqp->sq_tag = mp->b_tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else if (!(proc_type & SQS_WORKER)) - SQSTAT(sqp, sq_npackets_other); - else - SQSTAT(sqp, sq_npackets_worker); + /* + * Handle squeue switching. 
More details in the + * block comment at the top of the file + */ + if (connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + mp->b_tag); + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, + sqp, mblk_t *, mp, conn_t *, connp); + (*proc)(connp, mp, sqp); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, + sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, + SQ_FILL, SQTAG_SQUEUE_CHANGE); } -#endif - - connp->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); } - -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; -#endif + SQUEUE_DBG_CLEAR(sqp); mutex_enter(&sqp->sq_lock); - sqp->sq_count -= cnt; - total_cnt += cnt; + /* + * Check if there is still work to do (either more arrived or timer + * expired). If we are the worker thread and we are polling capable, + * continue doing the work since no one else is around to do the + * work anyway (but signal the poll thread to retrieve some packets + * in the meanwhile). If we are not the worker thread, just + * signal the worker thread to take up the work if processing time + * has expired. + */ if (sqp->sq_first != NULL) { - - now = gethrtime(); - if (!expire || (now < expire)) { - /* More arrived and time not expired */ - head = sqp->sq_first; - sqp->sq_first = NULL; - sqp->sq_last = NULL; - cnt = sqp->sq_count; - mutex_exit(&sqp->sq_lock); - goto again; - } - /* - * If we are not worker thread and we - * reached our time limit to do drain, - * signal the worker thread to pick - * up the work. - * If we were the worker thread, then - * we take a break to allow an interrupt - * or writer to pick up the load. + * Still more to process. If time quanta not expired, we + * should let the drain go on. The worker thread is allowed + * to drain as long as there is anything left. */ - if (proc_type != SQS_WORKER) { + now = gethrtime(); + if ((now < expire) || (proc_type == SQS_WORKER)) { + /* + * If time not expired or we are worker thread and + * this squeue is polling capable, continue to do + * the drain. + * + * We turn off interrupts for all userland threads + * doing drain but we do active polling only for + * worker thread. + */ + if (proc_type == SQS_WORKER) + SQS_POLL_RING(sqp, sq_poll_capable); + goto again; + } else { + did_wakeup = B_TRUE; sqp->sq_awaken = lbolt; - cv_signal(&sqp->sq_async); + cv_signal(&sqp->sq_worker_cv); } } /* - * Try to see if we can get a time estimate to process a packet. - * Do it only in interrupt context since less chance of context - * switch or pinning etc. to get a better estimate. + * If the poll thread is already running, just return. The + * poll thread continues to hold the proc and will finish + * processing. 
	 */
-	if (interrupt && ((drain_time = (lbolt - start)) > 0))
-		sqp->sq_avg_drain_time = ((80 * sqp->sq_avg_drain_time) +
-		    (20 * (drv_hztousec(drain_time)/total_cnt)))/100;
-
-	sqp->sq_state &= ~(SQS_PROC | proc_type);
+	if (sqp->sq_state & SQS_GET_PKTS) {
+		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_QUIESCE_DONE)));
+		sqp->sq_state &= ~proc_type;
+		return;
+	}

	/*
-	 * If polling was turned on, turn it off and reduce the default
-	 * interrupt blank interval as well to bring new packets in faster
-	 * (reduces the latency when there is no backlog).
+	 *
+	 * If we are the worker thread and no work is left, send the poll
+	 * thread down once more to see if something arrived. Otherwise,
+	 * turn the interrupts back on and we are done.
	 */
-	if (poll_on && (sqp->sq_state & SQS_POLL_CAPAB)) {
-		ASSERT(sq_rx_ring != NULL);
-		SQS_POLLING_OFF(sqp, sq_rx_ring);
+	if ((proc_type == SQS_WORKER) &&
+	    (sqp->sq_state & SQS_POLL_CAPAB)) {
+		/*
+		 * Do one last check to see if anything arrived
+		 * in the NIC. We leave the SQS_PROC set to ensure
+		 * that the poll thread keeps the PROC and can decide
+		 * if it needs to turn polling off or continue
+		 * processing.
+		 *
+		 * If we drop the SQS_PROC here and the poll thread comes
+		 * up empty-handed, it cannot safely turn polling off
+		 * since someone else could have acquired the PROC
+		 * and started draining. The previously running poll
+		 * thread and the current thread doing drain would end
+		 * up in a race for turning polling on/off and more
+		 * complex code would be required to deal with it.
+		 *
+		 * It is a lot simpler for drain to hand the SQS_PROC to
+		 * the poll thread (if running) and let the poll thread
+		 * finish without worrying about racing with any other
+		 * thread.
+		 */
+		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_QUIESCE_DONE)));
+		SQS_POLL_RING(sqp, sq_poll_capable);
+		sqp->sq_state &= ~proc_type;
+	} else {
+		/*
+		 * The squeue is either not capable of polling or the
+		 * poll thread already finished processing and didn't
+		 * find anything. Since there is nothing queued and
+		 * we already turned polling on (for all threads doing
+		 * the drain), we should turn polling off and relinquish
+		 * the PROC.
+		 */
+		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_QUIESCE_DONE)));
+		SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring);
+		sqp->sq_state &= ~(SQS_PROC | proc_type);
+		if (!did_wakeup && sqp->sq_first != NULL) {
+			squeue_worker_wakeup(sqp);
+			mutex_enter(&sqp->sq_lock);
+		}
+		/*
+		 * If we are not the worker and there is a pending quiesce
+		 * event, wake up the worker.
+		 */
+		if ((proc_type != SQS_WORKER) &&
+		    (sqp->sq_state & SQS_WORKER_THR_CONTROL))
+			cv_signal(&sqp->sq_worker_cv);
+	}
 }
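[Editor's illustration, not part of this patch: a toy model of the interrupt-vs-poll switch that squeue_drain and the poll thread drive through SQS_POLLING_ON/SQS_POLLING_OFF. ring_t and its callbacks are hypothetical stand-ins for the ill_rx_ring_t fields rr_intr_disable, rr_intr_enable and rr_rx; the 150000 byte budget mirrors MAX_BYTES_TO_PICKUP above.]

	#include <stddef.h>

	typedef struct ring {
		void	(*intr_disable)(void *);	/* models rr_intr_disable */
		void	(*intr_enable)(void *);		/* models rr_intr_enable */
		void	*(*rx)(void *, size_t);		/* models rr_rx: poll a chain */
		void	*handle;
		int	polling;			/* models SQS_POLLING */
	} ring_t;

	/* Backlog built up: turn interrupts off so the drain runs undisturbed. */
	static void
	poll_on(ring_t *r)
	{
		if (!r->polling) {
			r->polling = 1;
			r->intr_disable(r->handle);
		}
	}

	/* Queue and ring both empty: fall back to interrupt-driven delivery. */
	static void
	poll_off(ring_t *r)
	{
		if (r->polling) {
			r->polling = 0;
			r->intr_enable(r->handle);
		}
	}

	/*
	 * One worker pass: while polling, keep pulling chains from the ring
	 * and draining them; when the ring comes up empty, re-enable
	 * interrupts and return to interrupt mode.
	 */
	static void
	worker_pass(ring_t *r, void (*drain)(void *chain))
	{
		void *chain;

		poll_on(r);
		while ((chain = r->rx(r->handle, 150000)) != NULL)
			drain(chain);
		poll_off(r);
	}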

+/*
+ * Quiesce, Restart, or Cleanup of the squeue poll thread.
+ *
+ * Quiesce and Restart: After an squeue poll thread has been quiesced, it does
+ * not attempt to poll the underlying soft ring any more. The quiesce is
+ * triggered by the mac layer when it wants to quiesce a soft ring. Typically
+ * control operations such as changing the fanout of a NIC or VNIC (dladm
+ * setlinkprop) need to quiesce data flow before changing the wiring.
+ * The operation is done by the mac layer, but it calls back into IP to
+ * quiesce the soft ring. After completing the operation (say an increase or
+ * decrease of the fanout) the mac layer then calls back into IP to restart
+ * the quiesced soft ring.
+ *
+ * Cleanup: This is triggered when the squeue binding to a soft ring is
+ * removed permanently. Typically interface plumb and unplumb would trigger
+ * this. It can also be triggered from the mac layer when a soft ring is
+ * being deleted, say as the result of a fanout reduction. Since squeues are
+ * never deleted, the cleanup marks the squeue as fit for recycling and
+ * moves it to the zeroth squeue set.
+ */
 static void
-squeue_worker(squeue_t *sqp)
+squeue_poll_thr_control(squeue_t *sqp)
+{
+	if (sqp->sq_state & SQS_POLL_THR_RESTART) {
+		/* Restart implies a previous quiesce */
+		ASSERT(sqp->sq_state & SQS_POLL_THR_QUIESCED);
+		sqp->sq_state &= ~(SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_THR_RESTART);
+		sqp->sq_state |= SQS_POLL_CAPAB;
+		cv_signal(&sqp->sq_worker_cv);
+		return;
+	}
+
+	if (sqp->sq_state & SQS_POLL_THR_QUIESCE) {
+		sqp->sq_state |= SQS_POLL_THR_QUIESCED;
+		sqp->sq_state &= ~SQS_POLL_THR_QUIESCE;
+		cv_signal(&sqp->sq_worker_cv);
+		return;
+	}
+}
+
+/*
+ * POLLING Notes
+ *
+ * With polling mode, we want to do as much processing as we possibly can
+ * in worker thread context. The sweet spot is the worker thread keeps doing
+ * work all the time in polling mode and writers etc. keep dumping packets
+ * to the worker thread. Occasionally, we send the poll thread (running at
+ * lower priority) down to the NIC to get a chain of packets to feed to the
+ * worker. Sending the poll thread down to the NIC is dependent on 3 criteria:
+ *
+ * 1) It is always driven from squeue_drain and only if the worker thread is
+ *	doing the drain.
+ * 2) We cleared the backlog once and more packets arrived in between.
+ *	Before starting the drain again, send the poll thread down if
+ *	the drain is being done by the worker thread.
+ * 3) Before exiting squeue_drain, if the poll thread is not already
+ *	working and we are the worker thread, try to poll one more time.
+ *
+ * For latency's sake, we do allow any thread calling squeue_enter
+ * to process its packet provided:
+ *
+ * 1) Nothing is queued
+ * 2) If more packets arrived in between, the non-worker threads are allowed
+ *	to do the drain till their time quanta expire, provided SQS_GET_PKTS
+ *	wasn't set in between.
+ *
+ * Avoiding deadlocks with interrupts
+ * ==================================
+ *
+ * One of the big problems is that we can't send poll_thr down while holding
+ * the sq_lock since the thread can block. So we drop the sq_lock before
+ * calling sq_get_pkts(). We keep holding the SQS_PROC as long as the
+ * poll thread is running so that no other thread can acquire the
+ * perimeter in between. If the squeue_drain gets done (no more work
+ * left), it leaves the SQS_PROC set if the poll thread is running.
+ */
+
+/*
+ * This is the squeue poll thread. In poll mode, it polls the underlying
+ * TCP softring and feeds packets into the squeue. The worker thread then
+ * drains the squeue. The poll thread also responds to control signals for
+ * quiescing, restarting, or cleanup of an squeue. These are driven by
+ * control operations like plumb/unplumb or as a result of dynamic Rx ring
+ * related operations that are driven from the mac layer.
+ */ +static void +squeue_polling_thread(squeue_t *sqp) { kmutex_t *lock = &sqp->sq_lock; - kcondvar_t *async = &sqp->sq_async; + kcondvar_t *async = &sqp->sq_poll_cv; + ip_mac_rx_t sq_get_pkts; + ip_accept_t ip_accept; + ill_rx_ring_t *sq_rx_ring; + ill_t *sq_ill; + mblk_t *head, *tail, *mp; + uint_t cnt; + void *sq_mac_handle; callb_cpr_t cprinfo; - hrtime_t now; -#if SQUEUE_PROFILE - hrtime_t start; -#endif + size_t bytes_to_pickup; + uint32_t ctl_state; - CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "nca"); + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_poll"); mutex_enter(lock); for (;;) { - while (sqp->sq_first == NULL || (sqp->sq_state & SQS_PROC)) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); -still_wait: - cv_wait(async, lock); - if (sqp->sq_state & SQS_PROC) { - goto still_wait; - } - CALLB_CPR_SAFE_END(&cprinfo, lock); + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | + SQS_POLL_THR_QUIESCED); + if (ctl_state != 0) { + /* + * If the squeue is quiesced, then wait for a control + * request. A quiesced squeue must not poll the + * underlying soft ring. + */ + if (ctl_state == SQS_POLL_THR_QUIESCED) + continue; + /* + * Act on control requests to quiesce, cleanup or + * restart an squeue + */ + squeue_poll_thr_control(sqp); + continue; } -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - start = gethrtime(); + if (!(sqp->sq_state & SQS_POLL_CAPAB)) + continue; + + ASSERT((sqp->sq_state & + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + +poll_again: + sq_rx_ring = sqp->sq_rx_ring; + sq_get_pkts = sq_rx_ring->rr_rx; + sq_mac_handle = sq_rx_ring->rr_rx_handle; + ip_accept = sq_rx_ring->rr_ip_accept; + sq_ill = sq_rx_ring->rr_ill; + bytes_to_pickup = MAX_BYTES_TO_PICKUP; + mutex_exit(lock); + head = sq_get_pkts(sq_mac_handle, bytes_to_pickup); + mp = NULL; + if (head != NULL) { + /* + * We got the packet chain from the mac layer. It + * would be nice to be able to process it inline + * for better performance but we need to give + * IP a chance to look at this chain to ensure + * that packets are really meant for this squeue + * and do the IP processing. + */ + mp = ip_accept(sq_ill, sq_rx_ring, sqp, head, + &tail, &cnt); } -#endif + mutex_enter(lock); + if (mp != NULL) + ENQUEUE_CHAIN(sqp, mp, tail, cnt); - ASSERT(squeue_workerdrain_ns != 0); - now = gethrtime(); - sqp->sq_run = curthread; - squeue_drain(sqp, SQS_WORKER, now + squeue_workerdrain_ns); - sqp->sq_run = NULL; + ASSERT((sqp->sq_state & + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); - if (sqp->sq_first != NULL) { + if (sqp->sq_first != NULL && !(sqp->sq_state & SQS_WORKER)) { /* - * Doing too much processing by worker thread - * in presense of interrupts can be sub optimal. - * Instead, once a drain is done by worker thread - * for squeue_writerdrain_ns (the reason we are - * here), we force wait for squeue_workerwait_tick - * before doing more processing even if sq_wait is - * set to 0. - * - * This can be counterproductive for performance - * if worker thread is the only means to process - * the packets (interrupts or writers are not - * allowed inside the squeue). + * We have packets to process and worker thread + * is not running. Check to see if poll thread is + * allowed to process. Let it do processing only if it + * picked up some packets from the NIC otherwise + * wakeup the worker thread. 
*/ - if (sqp->sq_tid == 0 && - !(sqp->sq_state & SQS_TMO_PROG)) { - timeout_id_t tid; + if (mp != NULL) { + hrtime_t now; + + now = gethrtime(); + sqp->sq_run = curthread; + sqp->sq_drain(sqp, SQS_POLL_PROC, now + + squeue_drain_ns); + sqp->sq_run = NULL; + + if (sqp->sq_first == NULL) + goto poll_again; - sqp->sq_state |= SQS_TMO_PROG; - mutex_exit(&sqp->sq_lock); - tid = timeout(squeue_fire, sqp, - squeue_workerwait_tick); - mutex_enter(&sqp->sq_lock); /* - * Check again if we still need - * the timeout + * Couldn't do the entire drain because the + * time limit expired, let the + * worker thread take over. */ - if (((sqp->sq_state & (SQS_TMO_PROG|SQS_PROC)) - == SQS_TMO_PROG) && (sqp->sq_tid == 0) && - (sqp->sq_first != NULL)) { - sqp->sq_state &= ~SQS_TMO_PROG; - sqp->sq_awaken = lbolt; - sqp->sq_tid = tid; - } else if (sqp->sq_state & SQS_TMO_PROG) { - /* timeout not needed */ - sqp->sq_state &= ~SQS_TMO_PROG; - mutex_exit(&(sqp)->sq_lock); - (void) untimeout(tid); - mutex_enter(&sqp->sq_lock); - } } - CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(async, lock); - CALLB_CPR_SAFE_END(&cprinfo, lock); - } - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - SQDELTA(sqp, sq_time_worker, gethrtime() - start); + sqp->sq_awaken = lbolt; + /* + * Put the SQS_PROC_HELD on so the worker + * thread can distinguish where its called from. We + * can remove the SQS_PROC flag here and turn off the + * polling so that it wouldn't matter who gets the + * processing but we get better performance this way + * and save the cost of turn polling off and possibly + * on again as soon as we start draining again. + * + * We can't remove the SQS_PROC flag without turning + * polling off until we can guarantee that control + * will return to squeue_drain immediately. + */ + sqp->sq_state |= SQS_PROC_HELD; + sqp->sq_state &= ~SQS_GET_PKTS; + cv_signal(&sqp->sq_worker_cv); + } else if (sqp->sq_first == NULL && + !(sqp->sq_state & SQS_WORKER)) { + /* + * Nothing queued and worker thread not running. + * Since we hold the proc, no other thread is + * processing the squeue. This means that there + * is no work to be done and nothing is queued + * in squeue or in NIC. Turn polling off and go + * back to interrupt mode. + */ + sqp->sq_state &= ~(SQS_PROC|SQS_GET_PKTS); + /* LINTED: constant in conditional context */ + SQS_POLLING_OFF(sqp, B_TRUE, sq_rx_ring); + } else { + /* + * Worker thread is already running. We don't need + * to do anything. Indicate that poll thread is done. + */ + sqp->sq_state &= ~SQS_GET_PKTS; + } + if (sqp->sq_state & SQS_POLL_THR_CONTROL) { + /* + * Act on control requests to quiesce, cleanup or + * restart an squeue + */ + squeue_poll_thr_control(sqp); } -#endif } } -#if SQUEUE_PROFILE -static int -squeue_kstat_update(kstat_t *ksp, int rw) +/* + * The squeue worker thread acts on any control requests to quiesce, cleanup + * or restart an ill_rx_ring_t by calling this function. The worker thread + * synchronizes with the squeue poll thread to complete the request and finally + * wakes up the requestor when the request is completed. 
+ */ +static void +squeue_worker_thr_control(squeue_t *sqp) { - struct squeue_kstat *sqsp = &squeue_kstat; - squeue_t *sqp = ksp->ks_private; + ill_t *ill; + ill_rx_ring_t *rx_ring; - if (rw == KSTAT_WRITE) - return (EACCES); + ASSERT(MUTEX_HELD(&sqp->sq_lock)); -#if SQUEUE_DEBUG - sqsp->sq_count.value.ui64 = sqp->sq_count; - sqsp->sq_max_qlen.value.ui64 = sqp->sq_stats.sq_max_qlen; -#endif - sqsp->sq_npackets_worker.value.ui64 = sqp->sq_stats.sq_npackets_worker; - sqsp->sq_npackets_intr.value.ui64 = sqp->sq_stats.sq_npackets_intr; - sqsp->sq_npackets_other.value.ui64 = sqp->sq_stats.sq_npackets_other; - sqsp->sq_nqueued_intr.value.ui64 = sqp->sq_stats.sq_nqueued_intr; - sqsp->sq_nqueued_other.value.ui64 = sqp->sq_stats.sq_nqueued_other; - sqsp->sq_ndrains_worker.value.ui64 = sqp->sq_stats.sq_ndrains_worker; - sqsp->sq_ndrains_intr.value.ui64 = sqp->sq_stats.sq_ndrains_intr; - sqsp->sq_ndrains_other.value.ui64 = sqp->sq_stats.sq_ndrains_other; - sqsp->sq_time_worker.value.ui64 = sqp->sq_stats.sq_time_worker; - sqsp->sq_time_intr.value.ui64 = sqp->sq_stats.sq_time_intr; - sqsp->sq_time_other.value.ui64 = sqp->sq_stats.sq_time_other; - return (0); -} -#endif + if (sqp->sq_state & SQS_POLL_RESTART) { + /* Restart implies a previous quiesce. */ + ASSERT((sqp->sq_state & (SQS_PROC_HELD | + SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)) == + (SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)); + /* + * Request the squeue poll thread to restart and wait till + * it actually restarts. + */ + sqp->sq_state &= ~SQS_POLL_QUIESCE_DONE; + sqp->sq_state |= SQS_POLL_THR_RESTART; + cv_signal(&sqp->sq_poll_cv); + while (sqp->sq_state & SQS_POLL_THR_QUIESCED) + cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock); + sqp->sq_state &= ~(SQS_POLL_RESTART | SQS_PROC | + SQS_WORKER); + /* + * Signal any waiter that is waiting for the restart + * to complete + */ + sqp->sq_state |= SQS_POLL_RESTART_DONE; + cv_signal(&sqp->sq_ctrlop_done_cv); + return; + } -void -squeue_profile_enable(squeue_t *sqp) -{ - mutex_enter(&sqp->sq_lock); - sqp->sq_state |= SQS_PROFILE; - mutex_exit(&sqp->sq_lock); -} + if (sqp->sq_state & SQS_PROC_HELD) { + /* The squeue poll thread handed control to us */ + ASSERT(sqp->sq_state & SQS_PROC); + } -void -squeue_profile_disable(squeue_t *sqp) -{ - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~SQS_PROFILE; + /* + * Prevent any other thread from processing the squeue + * until we finish the control actions by setting SQS_PROC. + * But allow ourself to reenter by setting SQS_WORKER + */ + sqp->sq_state |= (SQS_PROC | SQS_WORKER); + + /* Signal the squeue poll thread and wait for it to quiesce itself */ + if (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) { + sqp->sq_state |= SQS_POLL_THR_QUIESCE; + cv_signal(&sqp->sq_poll_cv); + while (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) + cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock); + } + + rx_ring = sqp->sq_rx_ring; + ill = rx_ring->rr_ill; + /* + * The lock hierarchy is as follows. + * cpu_lock -> ill_lock -> sqset_lock -> sq_lock + */ mutex_exit(&sqp->sq_lock); -} + mutex_enter(&ill->ill_lock); + mutex_enter(&sqp->sq_lock); -void -squeue_profile_reset(squeue_t *sqp) -{ -#if SQUEUE_PROFILE - bzero(&sqp->sq_stats, sizeof (sqstat_t)); -#endif -} + SQS_POLLING_OFF(sqp, (sqp->sq_state & SQS_POLL_CAPAB) != 0, + sqp->sq_rx_ring); + sqp->sq_state &= ~(SQS_POLL_CAPAB | SQS_GET_PKTS | SQS_PROC_HELD); + if (sqp->sq_state & SQS_POLL_CLEANUP) { + /* + * Disassociate this squeue from its ill_rx_ring_t. 
+ * The rr_sqp, sq_rx_ring fields are protected by the + * corresponding squeue, ill_lock* and sq_lock. Holding any + * of them will ensure that the ring to squeue mapping does + * not change. + */ + ASSERT(!(sqp->sq_state & SQS_DEFAULT)); -void -squeue_profile_start(void) -{ -#if SQUEUE_PROFILE - squeue_profile = B_TRUE; -#endif + sqp->sq_rx_ring = NULL; + rx_ring->rr_sqp = NULL; + + sqp->sq_state &= ~(SQS_POLL_CLEANUP | SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE); + sqp->sq_ill = NULL; + + rx_ring->rr_rx_handle = NULL; + rx_ring->rr_intr_handle = NULL; + rx_ring->rr_intr_enable = NULL; + rx_ring->rr_intr_disable = NULL; + sqp->sq_state |= SQS_POLL_CLEANUP_DONE; + } else { + sqp->sq_state &= ~SQS_POLL_QUIESCE; + sqp->sq_state |= SQS_POLL_QUIESCE_DONE; + } + /* + * Signal any waiter that is waiting for the quiesce or cleanup + * to complete and also wait for it to actually see and reset the + * SQS_POLL_CLEANUP_DONE. + */ + cv_signal(&sqp->sq_ctrlop_done_cv); + mutex_exit(&ill->ill_lock); + if (sqp->sq_state & SQS_POLL_CLEANUP_DONE) { + cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock); + sqp->sq_state &= ~(SQS_PROC | SQS_WORKER); + } } -void -squeue_profile_stop(void) +static void +squeue_worker(squeue_t *sqp) { -#if SQUEUE_PROFILE - squeue_profile = B_FALSE; -#endif + kmutex_t *lock = &sqp->sq_lock; + kcondvar_t *async = &sqp->sq_worker_cv; + callb_cpr_t cprinfo; + hrtime_t now; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_worker"); + mutex_enter(lock); + + for (;;) { + for (;;) { + /* + * If the poll thread has handed control to us + * we need to break out of the wait. + */ + if (sqp->sq_state & SQS_PROC_HELD) + break; + + /* + * If the squeue is not being processed and we either + * have messages to drain or some thread has signaled + * some control activity we need to break + */ + if (!(sqp->sq_state & SQS_PROC) && + ((sqp->sq_state & SQS_WORKER_THR_CONTROL) || + (sqp->sq_first != NULL))) + break; + + /* + * If we have started some control action, then check + * for the SQS_WORKER flag (since we don't + * release the squeue) to make sure we own the squeue + * and break out + */ + if ((sqp->sq_state & SQS_WORKER_THR_CONTROL) && + (sqp->sq_state & SQS_WORKER)) + break; + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + } + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) { + squeue_worker_thr_control(sqp); + continue; + } + ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE | + SQS_WORKER_THR_CONTROL | SQS_POLL_THR_CONTROL))); + + if (sqp->sq_state & SQS_PROC_HELD) + sqp->sq_state &= ~SQS_PROC_HELD; + + now = gethrtime(); + sqp->sq_run = curthread; + sqp->sq_drain(sqp, SQS_WORKER, now + squeue_drain_ns); + sqp->sq_run = NULL; + } } uintptr_t * @@ -1482,9 +1240,3 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p) return (&sqp->sq_private[p]); } - -processorid_t -squeue_binding(squeue_t *sqp) -{ - return (sqp->sq_bind); -} diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 3b8440b230..4bb50d2344 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -96,6 +96,7 @@ #include <inet/ip_if.h> #include <inet/ipp_common.h> #include <inet/ip_netinfo.h> +#include <sys/squeue_impl.h> #include <sys/squeue.h> #include <inet/kssl/ksslapi.h> #include <sys/tsol/label.h> @@ -124,8 +125,8 @@ * The tcp data structure does not use any kind of lock for protecting * its state but instead uses 'squeues' for mutual exclusion from various * 
read and write side threads. To access a tcp member, the thread should
- * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
- * squeue_fill). Since the squeues allow a direct function call, caller
+ * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
+ * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
 * can pass any tcp function having prototype of edesc_t as argument
 * (different from traditional STREAMs model where packets come in only
 * designated entry points). The list of functions that can be directly
@@ -251,15 +252,12 @@
 /*
 * Values for squeue switch:
- * 1: squeue_enter_nodrain
- * 2: squeue_enter
- * 3: squeue_fill
+ * 1: SQ_NODRAIN
+ * 2: SQ_PROCESS
+ * 3: SQ_FILL
 */
-int tcp_squeue_close = 2; /* Setable in /etc/system */
-int tcp_squeue_wput = 2;
-
-squeue_func_t tcp_squeue_close_proc;
-squeue_func_t tcp_squeue_wput_proc;
+int tcp_squeue_wput = 2; /* Settable in /etc/system */
+int tcp_squeue_flag;
 
 /*
 * Macros for sodirect:
@@ -940,7 +938,7 @@ static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
 tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
 tcph_t *tcph, mblk_t *idmp);
-static squeue_func_t tcp_squeue_switch(int);
+static int tcp_squeue_switch(int);
 
 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
@@ -1865,9 +1863,9 @@ tcp_time_wait_collector(void *arg)
 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
 
 mp = &tcp->tcp_closemp;
- squeue_fill(connp->conn_sqp, mp,
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
 tcp_timewait_output, connp,
- SQTAG_TCP_TIMEWAIT);
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
 }
 } else {
 mutex_enter(&connp->conn_lock);
@@ -1893,8 +1891,9 @@ tcp_time_wait_collector(void *arg)
 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
 
 mp = &tcp->tcp_closemp;
- squeue_fill(connp->conn_sqp, mp,
- tcp_timewait_output, connp, 0);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ tcp_timewait_output, connp,
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
 }
 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
 }
@@ -2374,10 +2373,10 @@ finish:
 * queue.
 */
 /*
- * We already have a ref on tcp so no need to do one before squeue_fill
+ * We already have a ref on tcp so no need to do one before squeue_enter
 */
- squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
- tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish,
+ eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH);
 }
 
 /*
@@ -4048,8 +4047,8 @@ tcp_close(queue_t *q, int flags)
 
 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
 
- (*tcp_squeue_close_proc)(connp->conn_sqp, mp,
- tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
+ tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
 
 mutex_enter(&tcp->tcp_closelock);
 while (!tcp->tcp_closed) {
@@ -4074,9 +4073,9 @@ tcp_close(queue_t *q, int flags)
 /* Entering squeue, bump ref count.
*/ CONN_INC_REF(connp); bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); - squeue_enter(connp->conn_sqp, bp, + SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_linger_interrupted, connp, - SQTAG_IP_TCP_CLOSE); + tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); } break; @@ -4625,6 +4624,11 @@ tcp_free(tcp_t *tcp) tcp->tcp_ordrel_mp = NULL; } + if (tcp->tcp_ordrel_mp != NULL) { + freeb(tcp->tcp_ordrel_mp); + tcp->tcp_ordrel_mp = NULL; + } + if (tcp->tcp_sack_info != NULL) { if (tcp->tcp_notsack_list != NULL) { TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); @@ -4825,8 +4829,9 @@ tcp_drop_q0(tcp_t *tcp) /* Mark the IRE created for this SYN request temporary */ tcp_ip_ire_mark_advice(eager); - squeue_fill(eager->tcp_connp->conn_sqp, mp, - tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_clean_death_wrapper, eager->tcp_connp, + SQ_FILL, SQTAG_TCP_DROP_Q0); return (B_TRUE); } @@ -5302,6 +5307,7 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) * The caller already ensured that there is a sqp present. */ econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; if (connp->conn_policy != NULL) { ipsec_in_t *ii; @@ -5681,6 +5687,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) goto error2; ASSERT(econnp->conn_netstack == connp->conn_netstack); econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { /* * mp is updated in tcp_get_ipsec_conn(). @@ -6032,8 +6039,9 @@ error: freemsg(mp1); eager->tcp_closemp_used = B_TRUE; TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); - squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill, - econnp, SQTAG_TCP_CONN_REQ_2); + mp1 = &eager->tcp_closemp; + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, + econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2); /* * If a connection already exists, send the mp to that connections so @@ -6056,8 +6064,8 @@ error: CONN_DEC_REF(econnp); freemsg(mp); } else { - squeue_fill(econnp->conn_sqp, mp, tcp_input, - econnp, SQTAG_TCP_CONN_REQ_1); + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, + tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1); } } else { /* Nobody wants this packet */ @@ -6149,8 +6157,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) done: if (connp->conn_sqp != sqp) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, - connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); } else { tcp_conn_request(connp, mp, sqp); } @@ -7217,8 +7225,8 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) CONN_INC_REF(eager->tcp_connp); mutex_exit(&listener->tcp_eager_lock); mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, - eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, + eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); return (B_TRUE); } @@ -7245,9 +7253,9 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, eager->tcp_connp, - SQTAG_TCP_EAGER_CLEANUP); + SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); } eager = eager->tcp_eager_next_q; } @@ -7261,8 +7269,8 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 
TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_eager_kill, eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP_Q0); } eager = eager->tcp_eager_next_q0; @@ -9785,6 +9793,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (ENOSR); } connp->conn_sqp = IP_SQUEUE_GET(lbolt); + connp->conn_initial_sqp = connp->conn_sqp; tcp = connp->conn_tcp; q->q_ptr = WR(q)->q_ptr = connp; @@ -12059,13 +12068,13 @@ enq: * on the conn structure associated so the tcp is guaranteed to exist * when we come here. We still need to check the state because it might * as well has been closed. The squeue processing function i.e. squeue_enter, - * squeue_enter_nodrain, or squeue_drain is responsible for doing the - * CONN_DEC_REF. + * is responsible for doing the CONN_DEC_REF. * * Apart from the default entry point, IP also sends packets directly to * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming * connections. */ +boolean_t tcp_outbound_squeue_switch = B_FALSE; void tcp_input(void *arg, mblk_t *mp, void *arg2) { @@ -12102,10 +12111,33 @@ tcp_input(void *arg, mblk_t *mp, void *arg2) return; } - if (DB_TYPE(mp) == M_DATA) - tcp_rput_data(connp, mp, arg2); - else + if (DB_TYPE(mp) != M_DATA) { tcp_rput_common(tcp, mp); + return; + } + + if (mp->b_datap->db_struioflag & STRUIO_CONNECT) { + squeue_t *final_sqp; + + mp->b_datap->db_struioflag &= ~STRUIO_CONNECT; + final_sqp = (squeue_t *)DB_CKSUMSTART(mp); + DB_CKSUMSTART(mp) = 0; + if (tcp->tcp_state == TCPS_SYN_SENT && + connp->conn_final_sqp == NULL && + tcp_outbound_squeue_switch) { + ASSERT(connp->conn_initial_sqp == connp->conn_sqp); + connp->conn_final_sqp = final_sqp; + if (connp->conn_final_sqp != connp->conn_sqp) { + CONN_INC_REF(connp); + SQUEUE_SWITCH(connp, connp->conn_final_sqp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_rput_data, connp, ip_squeue_flag, + SQTAG_CONNECT_FINISH); + return; + } + } + } + tcp_rput_data(connp, mp, arg2); } /* @@ -14316,16 +14348,27 @@ process_ack: CONN_INC_REF(listener->tcp_connp); if (listener->tcp_connp->conn_sqp == connp->conn_sqp) { + /* + * We optimize by not calling an SQUEUE_ENTER + * on the listener since we know that the + * listener and eager squeues are the same. + * We are able to make this check safely only + * because neither the eager nor the listener + * can change its squeue. Only an active connect + * can change its squeue + */ tcp_send_conn_ind(listener->tcp_connp, mp, listener->tcp_connp->conn_sqp); CONN_DEC_REF(listener->tcp_connp); } else if (!tcp->tcp_loopback) { - squeue_fill(listener->tcp_connp->conn_sqp, mp, - tcp_send_conn_ind, - listener->tcp_connp, SQTAG_TCP_CONN_IND); + SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, + mp, tcp_send_conn_ind, + listener->tcp_connp, SQ_FILL, + SQTAG_TCP_CONN_IND); } else { - squeue_enter(listener->tcp_connp->conn_sqp, mp, - tcp_send_conn_ind, listener->tcp_connp, + SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, + mp, tcp_send_conn_ind, + listener->tcp_connp, SQ_PROCESS, SQTAG_TCP_CONN_IND); } } @@ -15884,7 +15927,6 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) return (mp); } - /* * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK * or a "bad" IRE detected by tcp_adapt_ire. 
@@ -16402,8 +16444,8 @@ tcp_rsrv(queue_t *q)
 mutex_exit(&tcp->tcp_rsrv_mp_lock);
 
 CONN_INC_REF(connp);
- squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp,
- SQTAG_TCP_RSRV);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
+ SQ_PROCESS, SQTAG_TCP_RSRV);
 }
 
 /*
@@ -18768,9 +18810,9 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
 /* Need to get inside the listener perimeter */
 CONN_INC_REF(listener->tcp_connp);
- squeue_fill(listener->tcp_connp->conn_sqp, mp1,
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
 tcp_send_pending, listener->tcp_connp,
- SQTAG_TCP_SEND_PENDING);
+ SQ_FILL, SQTAG_TCP_SEND_PENDING);
 }
 no_more_eagers:
 tcp_eager_unlink(eager);
@@ -18781,10 +18823,13 @@ no_more_eagers:
 * but we still have an extra refs on eager (apart from the
 * usual tcp references). The ref was placed in tcp_rput_data
 * before sending the conn_ind in tcp_send_conn_ind.
- * The ref will be dropped in tcp_accept_finish().
+ * The ref will be dropped in tcp_accept_finish(). As sockfs
+ * has already established this tcp with its own stream,
+ * it's OK to set tcp_detached to B_FALSE.
 */
- squeue_enter_nodrain(econnp->conn_sqp, opt_mp,
- tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0);
+ econnp->conn_tcp->tcp_detached = B_FALSE;
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish,
+ econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
 return;
 default:
 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
@@ -18916,7 +18961,6 @@ tcp_wput(queue_t *q, mblk_t *mp)
 t_scalar_t type;
 uchar_t *rptr;
 struct iocblk *iocp;
- uint32_t msize;
 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
 
 ASSERT(connp->conn_ref >= 2);
@@ -18926,18 +18970,16 @@ tcp_wput(queue_t *q, mblk_t *mp)
 tcp = connp->conn_tcp;
 ASSERT(tcp != NULL);
 
- msize = msgdsize(mp);
-
 mutex_enter(&tcp->tcp_non_sq_lock);
- tcp->tcp_squeue_bytes += msize;
+ tcp->tcp_squeue_bytes += msgdsize(mp);
 if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
 tcp_setqfull(tcp);
 }
 mutex_exit(&tcp->tcp_non_sq_lock);
 
 CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- tcp_output, connp, SQTAG_TCP_OUTPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
+ tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 return;
 
 case M_CMD:
@@ -19030,8 +19072,8 @@ tcp_wput(queue_t *q, mblk_t *mp)
 }
 
 CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- output_proc, connp, SQTAG_TCP_WPUT_OTHER);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
+ tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
 }
 
 /*
@@ -19503,34 +19545,27 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
 ntohs(ipha->ipha_length));
 
- if (ILL_DLS_CAPABLE(ill)) {
- /*
- * Send the packet directly to DLD, where it may be queued
- * depending on the availability of transmit resources at
- * the media layer.
- */ - IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len); - } else { - ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, ill, ipha, mp, mp, 0, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - if (mp != NULL) { - if (ipst->ips_ipobs_enabled) { - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, - IP_REAL_ZONEID(connp->conn_zoneid, ipst), - ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, - ipst); - } - DTRACE_IP_FASTPATH(mp, ipha, out_ill, ipha, NULL); - putnext(ire->ire_stq, mp); + if (mp != NULL) { + if (ipst->ips_ipobs_enabled) { + zoneid_t szone; + + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } + + ILL_SEND_TX(ill, ire, connp, mp, 0); } + IRE_REFRELE(ire); } @@ -21327,12 +21362,7 @@ tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, } /* send it down */ - if (ILL_DLS_CAPABLE(ill)) { - ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - ill_dls->ill_tx(ill_dls->ill_tx_handle, md_mp_head); - } else { - putnext(ire->ire_stq, md_mp_head); - } + putnext(ire->ire_stq, md_mp_head); /* we're done for TCP/IPv4 */ if (tcp->tcp_ipversion == IPV4_VERSION) @@ -21478,10 +21508,12 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); /* - * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp). + * Append LSO flags and mss to the mp. */ - DB_LSOFLAGS(mp) |= HW_LSO; - DB_LSOMSS(mp) = mss; + lso_info_set(mp, mss, HW_LSO); + + ipha->ipha_fragment_offset_and_flags |= + (uint32_t)htons(ire->ire_frag_flag); ire_fp_mp = ire->ire_nce->nce_fp_mp; ire_fp_mp_len = MBLKL(ire_fp_mp); @@ -21496,34 +21528,25 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, ntohs(ipha->ipha_length)); - if (ILL_DLS_CAPABLE(ill)) { - /* - * Send the packet directly to DLD, where it may be queued - * depending on the availability of transmit resources at - * the media layer. 
- */ - IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len); - } else { - ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, NULL, + ill, ipha, mp, mp, 0, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - if (mp != NULL) { - if (ipst->ips_ipobs_enabled) { - zoneid_t szone = tcp->tcp_connp->conn_zoneid; + if (mp != NULL) { + if (ipst->ips_ipobs_enabled) { + zoneid_t szone; - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, tcp->tcp_ipversion, - ire_fp_mp_len, ipst); - } - DTRACE_IP_FASTPATH(mp, ipha, out_ill, ipha, NULL); - putnext(ire->ire_stq, mp); + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } + + ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0); } } @@ -24921,9 +24944,6 @@ tcp_ddi_g_init(void) /* Initialize the random number generator */ tcp_random_init(); - tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); - tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); - /* A single callback independently of how many netstacks we have */ ip_squeue_init(tcp_squeue_add); @@ -24932,6 +24952,8 @@ tcp_ddi_g_init(void) tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, TASKQ_PREPOPULATE); + tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); + /* * We want to be informed each time a stack is created or * destroyed in the kernel, so we can maintain the @@ -25420,7 +25442,7 @@ tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) * If we get here, we are already on the correct * squeue. 
This ioctl follows the following path * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn - * ->tcp_ioctl_abort->squeue_fill (if on a + * ->tcp_ioctl_abort->squeue_enter (if on a * different squeue) */ int errcode; @@ -25487,8 +25509,8 @@ startover: listhead = listhead->b_next; tcp = (tcp_t *)mp->b_prev; mp->b_next = mp->b_prev = NULL; - squeue_fill(tcp->tcp_connp->conn_sqp, mp, - tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET); + SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input, + tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET); } *count += nmatch; @@ -25989,8 +26011,8 @@ tcp_timer_callback(void *arg) tcpt = (tcp_timer_t *)mp->b_rptr; connp = tcpt->connp; - squeue_fill(connp->conn_sqp, mp, - tcp_timer_handler, connp, SQTAG_TCP_TIMER); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, + SQ_FILL, SQTAG_TCP_TIMER); } static void @@ -26486,6 +26508,7 @@ tcp_kstat_update(kstat_t *kp, int rw) netstack_rele(ns); return (-1); } + tcpkp = (tcp_named_kstat_t *)kp->ks_data; tcpkp->currEstab.value.ui32 = 0; @@ -26583,8 +26606,8 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) /* Already has an eager */ if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { TCP_STAT(tcps, tcp_reinput_syn); - squeue_enter(connp->conn_sqp, mp, connp->conn_recv, - connp, SQTAG_TCP_REINPUT_EAGER); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER); return; } @@ -26609,21 +26632,21 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) DB_CKSUMSTART(mp) = (intptr_t)sqp; } - squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, - SQTAG_TCP_REINPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + SQ_FILL, SQTAG_TCP_REINPUT); } -static squeue_func_t +static int tcp_squeue_switch(int val) { - squeue_func_t rval = squeue_fill; + int rval = SQ_FILL; switch (val) { case 1: - rval = squeue_enter_nodrain; + rval = SQ_NODRAIN; break; case 2: - rval = squeue_enter; + rval = SQ_PROCESS; break; default: break; diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c index 0913da33f8..8eb8cddff3 100644 --- a/usr/src/uts/common/inet/tcp/tcp_kssl.c +++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c @@ -53,6 +53,7 @@ #include <inet/ipdrop.h> #include <inet/tcp_impl.h> +#include <sys/squeue_impl.h> #include <sys/squeue.h> #include <inet/kssl/ksslapi.h> @@ -70,7 +71,7 @@ static void tcp_kssl_input_asynch(void *, mblk_t *, void *); extern void tcp_output(void *, mblk_t *, void *); extern void tcp_send_conn_ind(void *, mblk_t *, void *); -extern squeue_func_t tcp_squeue_wput_proc; +extern int tcp_squeue_flag; /* * tcp_rput_data() calls this routine for all packet destined to a @@ -205,10 +206,10 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) listener->tcp_connp->conn_sqp); CONN_DEC_REF(listener->tcp_connp); } else { - squeue_fill( + SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, - listener->tcp_connp, + listener->tcp_connp, SQ_FILL, SQTAG_TCP_CONN_IND); } } @@ -294,11 +295,11 @@ no_can_do: listener->tcp_connp->conn_sqp); CONN_DEC_REF(listener->tcp_connp); } else { - squeue_fill( + SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, listener->tcp_connp, - SQTAG_TCP_CONN_IND); + SQ_FILL, SQTAG_TCP_CONN_IND); } } if (mp != NULL) @@ -343,8 +344,8 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) mutex_exit(&tcp->tcp_non_sq_lock); } CONN_INC_REF(connp); - (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, - tcp_output, connp, 
SQTAG_TCP_OUTPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, + tcp_squeue_flag, SQTAG_TCP_OUTPUT); /* FALLTHROUGH */ case KSSL_CMD_NONE: @@ -375,8 +376,8 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) */ if ((sqmp = allocb(1, BPRI_MED)) != NULL) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, sqmp, tcp_kssl_input_asynch, - connp, SQTAG_TCP_KSSL_INPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, sqmp, tcp_kssl_input_asynch, + connp, SQ_FILL, SQTAG_TCP_KSSL_INPUT); } else { DTRACE_PROBE(kssl_err__allocb_failed); } diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 3369ca915e..70677c86d8 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -78,6 +78,7 @@ #include <inet/ipclassifier.h> #include <inet/ipsec_impl.h> #include <inet/ipp_common.h> +#include <sys/squeue_impl.h> #include <inet/ipnet.h> /* @@ -196,14 +197,15 @@ static int udp_rinfop(queue_t *q, infod_t *dp); static int udp_rrw(queue_t *q, struiod_t *dp); static int udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); -static void udp_send_data(udp_t *, queue_t *, mblk_t *, ipha_t *); +static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, + ipha_t *ipha); static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, t_scalar_t err); static void udp_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random); static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t, - int *, boolean_t); + int *, boolean_t); static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error); static void udp_wput_other(queue_t *q, mblk_t *mp); @@ -4401,6 +4403,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) UDP_STAT(us, udp_in_recvucred); } + /* XXX FIXME: apply to AF_INET6 as well */ /* * If SO_TIMESTAMP is set allocate the appropriate sized * buffer. Since gethrestime() expects a pointer aligned @@ -6237,8 +6240,12 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) dev_q = ire->ire_stq->q_next; ASSERT(dev_q != NULL); + ill = ire_to_ill(ire); + ASSERT(ill != NULL); - if (DEV_Q_IS_FLOW_CTLED(dev_q)) { + /* is queue flow controlled? */ + if (q->q_first != NULL || connp->conn_draining || + DEV_Q_FLOW_BLOCKED(dev_q)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); if (ipst->ips_ip_output_queue) @@ -6256,8 +6263,6 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) dst = ipha->ipha_dst; src = ipha->ipha_src; - ill = ire_to_ill(ire); - ASSERT(ill != NULL); BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); @@ -6334,31 +6339,32 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, ntohs(ipha->ipha_length)); - if (ILL_DLS_CAPABLE(ill)) { - /* - * Send the packet directly to DLD, where it may be queued - * depending on the availability of transmit resources at - * the media layer. 
- */ - IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len); - } else { - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, ll_multicast, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (mp != NULL) { - if (ipst->ips_ipobs_enabled) { - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, - IP_REAL_ZONEID(connp->conn_zoneid, ipst), - ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, - ipst); - } - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, NULL, ill, ipha, mp, mp, + ll_multicast, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + if (ipst->ips_ipobs_enabled && mp != NULL) { + zoneid_t szone; + + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); + } + + if (mp != NULL) { + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, + ipha_t *, ipha, ip6_t *, NULL, int, 0); + + if (ILL_DIRECT_CAPABLE(ill)) { + ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + + (void) idd->idd_tx_df(idd->idd_tx_dh, mp, + (uintptr_t)connp, 0); + } else { putnext(ire->ire_stq, mp); } } diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 04b8dbc22c..468fa553f4 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -26,8 +26,6 @@ #ifndef _UDP_IMPL_H #define _UDP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * UDP implementation private declarations. These interfaces are * used to build the IP module and are not meant to be accessed @@ -159,7 +157,7 @@ typedef struct udp_fanout_s { * below IP and if the q_first is NULL, we optimize by not doing * the canput check */ -#define DEV_Q_IS_FLOW_CTLED(dev_q) \ +#define DEV_Q_FLOW_BLOCKED(dev_q) \ (((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \ !canput(dev_q)) @@ -371,9 +369,7 @@ extern void udp_quiesce_conn(conn_t *); extern void udp_ddi_init(void); extern void udp_ddi_destroy(void); extern void udp_resume_bind(conn_t *, mblk_t *); -extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, - socklen_t addrlen); -extern void udp_wput(queue_t *, mblk_t *); +extern void udp_wput(queue_t *, mblk_t *); extern int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr); diff --git a/usr/src/uts/common/io/afe/afe.c b/usr/src/uts/common/io/afe/afe.c index a89926f58f..9f32d0d3f8 100644 --- a/usr/src/uts/common/io/afe/afe.c +++ b/usr/src/uts/common/io/afe/afe.c @@ -184,7 +184,6 @@ static mac_callbacks_t afe_m_callbacks = { afe_m_multicst, afe_m_unicst, afe_m_tx, - NULL, /* mc_resources */ NULL, /* mc_ioctl */ NULL, /* mc_getcapab */ NULL, /* mc_open */ diff --git a/usr/src/uts/common/io/afe/afeimpl.h b/usr/src/uts/common/io/afe/afeimpl.h index 0dccbe1acd..2b2e0c237d 100644 --- a/usr/src/uts/common/io/afe/afeimpl.h +++ b/usr/src/uts/common/io/afe/afeimpl.h @@ -36,10 +36,10 @@ #ifndef _AFEIMPL_H #define _AFEIMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL +#include <sys/mac_provider.h> + /* * Compile time tunables. 
*/ diff --git a/usr/src/uts/common/io/aggr/aggr_ctl.c b/usr/src/uts/common/io/aggr/aggr_ctl.c index 0cfb177ed6..ea167fda28 100644 --- a/usr/src/uts/common/io/aggr/aggr_ctl.c +++ b/usr/src/uts/common/io/aggr/aggr_ctl.c @@ -29,13 +29,14 @@ #include <sys/aggr.h> #include <sys/aggr_impl.h> +#include <sys/priv_names.h> /* * Process a LAIOC_MODIFY request. */ /* ARGSUSED */ static int -aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_modify_t *modify_arg = karg; uint32_t policy; @@ -68,8 +69,8 @@ aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) lacp_timer = modify_arg->lu_lacp_timer; } - return (aggr_grp_modify(modify_arg->lu_linkid, NULL, modify_mask, - policy, mac_fixed, mac_addr, lacp_mode, lacp_timer)); + return (aggr_grp_modify(modify_arg->lu_linkid, modify_mask, policy, + mac_fixed, mac_addr, lacp_mode, lacp_timer)); } /* @@ -77,7 +78,7 @@ aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) */ /* ARGSUSED */ static int -aggr_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_create_t *create_arg = karg; uint16_t nports; @@ -122,7 +123,7 @@ done: /* ARGSUSED */ static int -aggr_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_delete_t *delete_arg = karg; @@ -191,7 +192,7 @@ aggr_ioc_info_new_port(void *arg, datalink_id_t linkid, uchar_t *mac, /*ARGSUSED*/ static int -aggr_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_info_t *info_argp = karg; datalink_id_t linkid; @@ -249,30 +250,31 @@ done: /* ARGSUSED */ static int -aggr_ioc_add(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_add(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { return (aggr_ioc_add_remove(karg, arg, LAIOC_ADD, mode)); } /* ARGSUSED */ static int -aggr_ioc_remove(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_remove(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { return (aggr_ioc_add_remove(karg, arg, LAIOC_REMOVE, mode)); } static dld_ioc_info_t aggr_ioc_list[] = { - {LAIOC_CREATE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_create_t), - aggr_ioc_create}, - {LAIOC_DELETE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_delete_t), - aggr_ioc_delete}, - {LAIOC_INFO, DLDCOPYINOUT, sizeof (laioc_info_t), aggr_ioc_info}, - {LAIOC_ADD, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_add_rem_t), - aggr_ioc_add}, - {LAIOC_REMOVE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_add_rem_t), - aggr_ioc_remove}, - {LAIOC_MODIFY, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_modify_t), - aggr_ioc_modify} + {LAIOC_CREATE, DLDCOPYIN, sizeof (laioc_create_t), aggr_ioc_create, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_DELETE, DLDCOPYIN, sizeof (laioc_delete_t), aggr_ioc_delete, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_INFO, DLDCOPYINOUT, sizeof (laioc_info_t), aggr_ioc_info, + {NULL}}, + {LAIOC_ADD, DLDCOPYIN, sizeof (laioc_add_rem_t), aggr_ioc_add, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_REMOVE, DLDCOPYIN, sizeof (laioc_add_rem_t), aggr_ioc_remove, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_MODIFY, DLDCOPYIN, sizeof (laioc_modify_t), aggr_ioc_modify, + {PRIV_SYS_DL_CONFIG}} }; int diff --git a/usr/src/uts/common/io/aggr/aggr_dev.c b/usr/src/uts/common/io/aggr/aggr_dev.c index fc2c396c2b..6640015af5 100644 --- 
a/usr/src/uts/common/io/aggr/aggr_dev.c +++ b/usr/src/uts/common/io/aggr/aggr_dev.c @@ -42,38 +42,8 @@ static int aggr_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int aggr_attach(dev_info_t *, ddi_attach_cmd_t); static int aggr_detach(dev_info_t *, ddi_detach_cmd_t); -static struct cb_ops aggr_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ - nulldev, /* strategy */ - nulldev, /* print */ - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - nodev, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* cb_prop_op */ - 0, /* streamtab */ - D_MP /* Driver compatibility flag */ -}; - -static struct dev_ops aggr_dev_ops = { - DEVO_REV, /* devo_rev */ - 0, /* refcnt */ - aggr_getinfo, /* get_dev_info */ - nulldev, /* identify */ - nulldev, /* probe */ - aggr_attach, /* attach */ - aggr_detach, /* detach */ - nodev, /* reset */ - &aggr_cb_ops, /* driver operations */ - NULL, /* bus operations */ - nodev, /* dev power */ - ddi_quiesce_not_supported, /* dev quiesce */ -}; +DDI_DEFINE_STREAM_OPS(aggr_dev_ops, nulldev, nulldev, aggr_attach, aggr_detach, + nodev, aggr_getinfo, D_MP, NULL, ddi_quiesce_not_supported); static struct modldrv aggr_modldrv = { &mod_driverops, /* Type of module. This one is a driver */ @@ -82,9 +52,7 @@ static struct modldrv aggr_modldrv = { }; static struct modlinkage modlinkage = { - MODREV_1, - &aggr_modldrv, - NULL + MODREV_1, &aggr_modldrv, NULL }; int diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index cee6d5e45f..fa90087320 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -39,6 +39,7 @@ #include <sys/sysmacros.h> #include <sys/conf.h> #include <sys/cmn_err.h> +#include <sys/disp.h> #include <sys/list.h> #include <sys/ksynch.h> #include <sys/kmem.h> @@ -52,6 +53,7 @@ #include <sys/id_space.h> #include <sys/strsun.h> #include <sys/dlpi.h> +#include <sys/mac_provider.h> #include <sys/dls.h> #include <sys/vlan.h> #include <sys/aggr.h> @@ -63,7 +65,6 @@ static int aggr_m_promisc(void *, boolean_t); static int aggr_m_multicst(void *, boolean_t, const uint8_t *); static int aggr_m_unicst(void *, const uint8_t *); static int aggr_m_stat(void *, uint_t, uint64_t *); -static void aggr_m_resources(void *); static void aggr_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); @@ -76,8 +77,20 @@ static uint_t aggr_grp_max_sdu(aggr_grp_t *); static uint32_t aggr_grp_max_margin(aggr_grp_t *); static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); -static int aggr_grp_multicst(aggr_grp_t *grp, boolean_t add, - const uint8_t *addrp); + +static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); +static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); +static int aggr_pseudo_disable_intr(mac_intr_handle_t); +static int aggr_pseudo_enable_intr(mac_intr_handle_t); +static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); +static void aggr_pseudo_stop_ring(mac_ring_driver_t); +static int aggr_addmac(void *, const uint8_t *); +static int aggr_remmac(void *, const uint8_t *); +static mblk_t *aggr_rx_poll(void *, int); +static void aggr_fill_ring(void *, mac_ring_type_t, const int, + const int, mac_ring_info_t *, mac_ring_handle_t); +static void 
aggr_fill_group(void *, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
 
 static kmem_cache_t *aggr_grp_cache;
 static mod_hash_t *aggr_grp_hash;
@@ -87,10 +100,11 @@ static id_space_t *key_ids;
 
 #define GRP_HASHSZ 64
 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid)
+#define AGGR_PORT_NAME_DELIMIT '-'
 
 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
 
-#define AGGR_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB)
+#define AGGR_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
 
 static mac_callbacks_t aggr_m_callbacks = {
 AGGR_M_CALLBACK_FLAGS,
@@ -99,9 +113,8 @@ static mac_callbacks_t aggr_m_callbacks = {
 aggr_m_stop,
 aggr_m_promisc,
 aggr_m_multicst,
- aggr_m_unicst,
+ NULL,
 aggr_m_tx,
- aggr_m_resources,
 aggr_m_ioctl,
 aggr_m_capab_get
 };
@@ -113,11 +126,12 @@ aggr_grp_constructor(void *buf, void *arg, int kmflag)
 aggr_grp_t *grp = buf;
 
 bzero(grp, sizeof (*grp));
- rw_init(&grp->lg_lock, NULL, RW_DRIVER, NULL);
- rw_init(&grp->aggr.gl_lock, NULL, RW_DRIVER, NULL);
-
+ mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
+ mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
 grp->lg_link_state = LINK_STATE_UNKNOWN;
-
 return (0);
 }
 
@@ -132,8 +146,11 @@ aggr_grp_destructor(void *buf, void *arg)
 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
 }
 
- rw_destroy(&grp->aggr.gl_lock);
- rw_destroy(&grp->lg_lock);
+ mutex_destroy(&grp->lg_lacp_lock);
+ cv_destroy(&grp->lg_lacp_cv);
+ mutex_destroy(&grp->lg_port_lock);
+ cv_destroy(&grp->lg_port_cv);
+ rw_destroy(&grp->lg_tx_lock);
 }
 
 void
@@ -179,6 +196,51 @@ aggr_grp_count(void)
 }
 
 /*
+ * Since both aggr_port_notify_cb() and aggr_port_timer_thread() require
+ * the mac perimeter, this function holds a reference on the aggr, and aggr
+ * won't call mac_unregister() until this reference drops to 0.
+ */
+void
+aggr_grp_port_hold(aggr_port_t *port)
+{
+ aggr_grp_t *grp = port->lp_grp;
+
+ AGGR_PORT_REFHOLD(port);
+ mutex_enter(&grp->lg_port_lock);
+ grp->lg_port_ref++;
+ mutex_exit(&grp->lg_port_lock);
+}
+
+/*
+ * Release the reference on the grp and inform aggr_grp_delete() that
+ * calling mac_unregister() is now safe.
+ */
+void
+aggr_grp_port_rele(aggr_port_t *port)
+{
+ aggr_grp_t *grp = port->lp_grp;
+
+ mutex_enter(&grp->lg_port_lock);
+ if (--grp->lg_port_ref == 0)
+ cv_signal(&grp->lg_port_cv);
+ mutex_exit(&grp->lg_port_lock);
+ AGGR_PORT_REFRELE(port);
+}
+
+/*
+ * Wait for the port's lacp timer thread and the port's notification callback
+ * to exit.
+ */
+void
+aggr_grp_port_wait(aggr_grp_t *grp)
+{
+ mutex_enter(&grp->lg_port_lock);
+ if (grp->lg_port_ref != 0)
+ cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
+ mutex_exit(&grp->lg_port_lock);
+}
+
+/*
 * Attach a port to a link aggregation group.
* * A port is attached to a link aggregation group once its speed @@ -193,9 +255,8 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) { boolean_t link_state_changed = B_FALSE; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (port->lp_state == AGGR_PORT_STATE_ATTACHED) return (B_FALSE); @@ -251,7 +312,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) /* * Set port's receive callback */ - port->lp_mrh = mac_rx_add(port->lp_mh, aggr_recv_cb, (void *)port); + mac_rx_set(port->lp_mch, aggr_recv_cb, port); /* * If LACP is OFF, the port can be used to send data as soon @@ -270,28 +331,28 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) } boolean_t -aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port, boolean_t port_detach) +aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) { boolean_t link_state_changed = B_FALSE; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); + /* update state */ if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return (B_FALSE); - mac_rx_remove(port->lp_mh, port->lp_mrh, B_FALSE); + mac_rx_clear(port->lp_mch); aggr_grp_multicst_port(port, B_FALSE); if (grp->lg_lacp_mode == AGGR_LACP_OFF) aggr_send_port_disable(port); - else if (port_detach) + else aggr_lacp_port_detached(port); - /* update state */ port->lp_state = AGGR_PORT_STATE_STANDBY; + grp->lg_nattached_ports--; if (grp->lg_nattached_ports == 0) { /* the last attached MAC port of the group is being detached */ @@ -323,17 +384,15 @@ aggr_grp_update_ports_mac(aggr_grp_t *grp) { aggr_port_t *cport; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t mph; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - - if (grp->lg_closing) - return (link_state_changed); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); for (cport = grp->lg_ports; cport != NULL; cport = cport->lp_next) { - rw_enter(&cport->lp_lock, RW_WRITER); - if (aggr_port_unicst(cport, grp->lg_addr) != 0) { - if (aggr_grp_detach_port(grp, cport, B_TRUE)) + mac_perim_enter_by_mh(cport->lp_mh, &mph); + if (aggr_port_unicst(cport) != 0) { + if (aggr_grp_detach_port(grp, cport)) link_state_changed = B_TRUE; } else { /* @@ -346,7 +405,7 @@ aggr_grp_update_ports_mac(aggr_grp_t *grp) if (aggr_grp_attach_port(grp, cport)) link_state_changed = B_TRUE; } - rw_exit(&cport->lp_lock); + mac_perim_exit(mph); } return (link_state_changed); } @@ -365,9 +424,8 @@ void aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); ASSERT(mac_addr_changedp != NULL); ASSERT(link_state_changedp != NULL); @@ -394,9 +452,8 @@ aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, * Update the actual port MAC address to the MAC address * of the group. 
*/ - if (aggr_port_unicst(port, grp->lg_addr) != 0) { - *link_state_changedp = aggr_grp_detach_port(grp, port, - B_TRUE); + if (aggr_port_unicst(port) != 0) { + *link_state_changedp = aggr_grp_detach_port(grp, port); } else { /* * If a port was detached because of a previous @@ -414,21 +471,25 @@ aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, * Add a port to a link aggregation group. */ static int -aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t linkid, boolean_t force, +aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, aggr_port_t **pp) { aggr_port_t *port, **cport; + mac_perim_handle_t mph; int err; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + /* + * lg_mh could be NULL when the function is called during the creation + * of the aggregation. + */ + ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); /* create new port */ - err = aggr_port_create(linkid, force, &port); + err = aggr_port_create(grp, port_linkid, force, &port); if (err != 0) return (err); - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* add port to list of group constituent ports */ cport = &grp->lg_ports; @@ -446,19 +507,238 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t linkid, boolean_t force, grp->lg_nports++; aggr_lacp_init_port(port); + mac_perim_exit(mph); + + if (pp != NULL) + *pp = port; + + return (0); +} + +/* + * Add a pseudo Rx ring for the given HW ring handle. + */ +static int +aggr_add_pseudo_rx_ring(aggr_port_t *port, + aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) +{ + aggr_pseudo_rx_ring_t *ring; + int err; + int j; + + for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { + ring = rx_grp->arg_rings + j; + if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) + break; + } /* - * Initialize the callback functions for this port. Note that this - * can only be done after the lp_grp field is set. + * No slot for this new Rx ring. */ - aggr_port_init_callbacks(port); + if (j == MAX_RINGS_PER_GROUP) + return (EIO); - rw_exit(&port->lp_lock); + ring->arr_flags |= MAC_PSEUDO_RING_INUSE; + ring->arr_hw_rh = hw_rh; + ring->arr_port = port; + rx_grp->arg_ring_cnt++; - if (pp != NULL) - *pp = port; + /* + * The group is already registered, dynamically add a new ring to the + * mac group. + */ + mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring); + if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { + ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; + ring->arr_hw_rh = NULL; + ring->arr_port = NULL; + rx_grp->arg_ring_cnt--; + mac_hwring_teardown(hw_rh); + } + return (err); +} - return (0); +/* + * Remove the pseudo Rx ring of the given HW ring handle. + */ +static void +aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) +{ + aggr_pseudo_rx_ring_t *ring; + int j; + + for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { + ring = rx_grp->arg_rings + j; + if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || + ring->arr_hw_rh != hw_rh) { + continue; + } + + mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); + + ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; + ring->arr_hw_rh = NULL; + ring->arr_port = NULL; + rx_grp->arg_ring_cnt--; + mac_hwring_teardown(hw_rh); + break; + } +} + +/* + * This function is called to create pseudo rings over the hardware rings of + * the underlying device. Note that there is a 1:1 mapping between the pseudo + * RX rings of the aggr and the hardware rings of the underlying port. 
+ */
+static int
+aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
+ aggr_unicst_addr_t *addr, *a;
+ mac_perim_handle_t pmph;
+ int hw_rh_cnt, i = 0, j;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /*
+ * This function must be called after the aggr registers its mac
+ * and its RX group has been initialized.
+ */
+ ASSERT(rx_grp->arg_gh != NULL);
+
+ /*
+ * Get the list of the underlying HW rings.
+ */
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh);
+
+ if (port->lp_hwgh != NULL) {
+ /*
+ * Quiesce the HW ring and the mac srs on the ring. Note
+ * that the HW ring will be restarted when the pseudo ring
+ * is started. At that time all the packets will be
+ * directly passed up to the pseudo RX ring and handled
+ * by mac srs created over the pseudo RX ring.
+ */
+ mac_rx_client_quiesce(port->lp_mch);
+ mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
+ }
+
+ /*
+ * Add all the unicast addresses to the newly added port.
+ */
+ for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
+ if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
+ break;
+ }
+
+ for (i = 0; err == 0 && i < hw_rh_cnt; i++)
+ err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
+
+ if (err != 0) {
+ for (j = 0; j < i; j++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
+
+ for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
+ aggr_port_remmac(port, a->aua_addr);
+
+ if (port->lp_hwgh != NULL) {
+ mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
+ mac_rx_client_restart(port->lp_mch);
+ port->lp_hwgh = NULL;
+ }
+ } else {
+ port->lp_grp_added = B_TRUE;
+ }
+done:
+ mac_perim_exit(pmph);
+ return (err);
+}
+
+/*
+ * This function is called by aggr to remove pseudo RX rings over the
+ * HW rings of the underlying port.
+ */
+static void
+aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
+ aggr_unicst_addr_t *addr;
+ mac_group_handle_t hwgh;
+ mac_perim_handle_t pmph;
+ int hw_rh_cnt, i;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ if (!port->lp_grp_added)
+ goto done;
+
+ ASSERT(rx_grp->arg_gh != NULL);
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh);
+
+ /*
+ * If hw_rh_cnt is 0, it means that the underlying port does not
+ * support RX rings; in that case the loops below simply do nothing.
+ */
+ for (i = 0; i < hw_rh_cnt; i++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
+
+ for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
+ aggr_port_remmac(port, addr->aua_addr);
+
+ if (port->lp_hwgh != NULL) {
+ port->lp_hwgh = NULL;
+
+ /*
+ * First clear the permanent-quiesced flag of the RX srs then
+ * restart the HW ring and the mac srs on the ring. Note that
+ * the HW ring and associated SRS will soon be removed when
+ * the port is removed from the aggr.
+ */
+ mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
+ mac_rx_client_restart(port->lp_mch);
+ }
+
+ port->lp_grp_added = B_FALSE;
+done:
+ mac_perim_exit(pmph);
+}
+
+static int
+aggr_pseudo_disable_intr(mac_intr_handle_t ih)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
+ return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
+}
+
+static int
+aggr_pseudo_enable_intr(mac_intr_handle_t ih)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
+ return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
+}
+
+static int
+aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ int err;
+
+ err = mac_hwring_start(rr_ring->arr_hw_rh);
+ if (err == 0)
+ rr_ring->arr_gen = mr_gen;
+ return (err);
+}
+
+static void
+aggr_pseudo_stop_ring(mac_ring_driver_t arg)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ mac_hwring_stop(rr_ring->arr_hw_rh);
 }
 
 /*
@@ -472,6 +752,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 aggr_grp_t *grp = NULL;
 aggr_port_t *port;
 boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t mph, pmph;
 
 /* get group corresponding to linkid */
 rw_enter(&aggr_grp_lock, RW_READER);
@@ -481,10 +762,12 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 return (ENOENT);
 }
 AGGR_GRP_REFHOLD(grp);
- rw_exit(&aggr_grp_lock);
 
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
+ /*
+ * Hold the perimeter so that the aggregation won't be destroyed.
+ */
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ rw_exit(&aggr_grp_lock);
 
 /* add the specified ports to group */
 for (i = 0; i < nports; i++) {
@@ -504,29 +787,53 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 goto bail;
 }
 
+ /*
+ * Create the pseudo ring for each HW ring of the underlying
+ * port.
+ */
+ rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
+ if (rc != 0)
+ goto bail;
+
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /* set LACP mode */
+ aggr_port_lacp_set_mode(grp, port);
+
 /* start port if group has already been started */
 if (grp->lg_started) {
- rw_enter(&port->lp_lock, RW_WRITER);
 rc = aggr_port_start(port);
 if (rc != 0) {
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
 goto bail;
 }
 
- /* set port promiscuous mode */
- rc = aggr_port_promisc(port, grp->lg_promisc);
- if (rc != 0) {
- rw_exit(&port->lp_lock);
- goto bail;
+ /*
+ * Turn on the promiscuous mode over the port when it
+ * is requested to be turned on to receive the
+ * non-primary address over a port, or the promiscuous
+ * mode is enabled over the aggr.
+ */
+ if (grp->lg_promisc || port->lp_prom_addr != NULL) {
+ rc = aggr_port_promisc(port, B_TRUE);
+ if (rc != 0) {
+ mac_perim_exit(pmph);
+ goto bail;
+ }
 }
- rw_exit(&port->lp_lock);
 }
+ mac_perim_exit(pmph);
 
 /*
 * Attach each port if necessary.
 */
- if (aggr_port_notify_link(grp, port, B_FALSE))
+ if (aggr_port_notify_link(grp, port))
 link_state_changed = B_TRUE;
+
+ /*
+ * Initialize the callback functions for this port.
+ */ + aggr_port_init_callbacks(port); } /* update the MAC address of the constituent ports */ @@ -539,64 +846,43 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, bail: if (rc != 0) { /* stop and remove ports that have been added */ - for (i = 0; i < nadded && !grp->lg_closing; i++) { + for (i = 0; i < nadded; i++) { port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); if (grp->lg_started) { - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + (void) aggr_port_promisc(port, B_FALSE); aggr_port_stop(port); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); (void) aggr_grp_rem_port(grp, port, NULL, NULL); } } - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - if (rc == 0 && !grp->lg_closing) + if (rc == 0) mac_resource_update(grp->lg_mh); + mac_perim_exit(mph); AGGR_GRP_REFRELE(grp); return (rc); } -/* - * Update properties of an existing link aggregation group. - */ -int -aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask, - uint32_t policy, boolean_t mac_fixed, const uchar_t *mac_addr, - aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer) +static int +aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, + boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, + aggr_lacp_timer_t lacp_timer) { - int rc = 0; - aggr_grp_t *grp = NULL; boolean_t mac_addr_changed = B_FALSE; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t pmph; - if (grp_arg == NULL) { - /* get group corresponding to linkid */ - rw_enter(&aggr_grp_lock, RW_READER); - if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), - (mod_hash_val_t *)&grp) != 0) { - rc = ENOENT; - goto bail; - } - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - } else { - grp = grp_arg; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - } - - ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock)); - AGGR_GRP_REFHOLD(grp); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); /* validate fixed address if specified */ if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || (mac_addr[0] & 0x01))) { - rc = EINVAL; - goto bail; + return (EINVAL); } /* update policy if requested */ @@ -616,11 +902,11 @@ aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask, /* switch from user-supplied to automatic */ aggr_port_t *port = grp->lg_ports; - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); grp->lg_mac_addr_port = port; mac_addr_changed = B_TRUE; - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } grp->lg_addr_fixed = mac_fixed; } @@ -631,36 +917,51 @@ aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask, if (update_mask & AGGR_MODIFY_LACP_MODE) aggr_lacp_update_mode(grp, lacp_mode); - if ((update_mask & AGGR_MODIFY_LACP_TIMER) && !grp->lg_closing) + if (update_mask & AGGR_MODIFY_LACP_TIMER) aggr_lacp_update_timer(grp, lacp_timer); -bail: - if (grp != NULL && !grp->lg_closing) { - /* - * If grp_arg is non-NULL, this function is called from - * mac_unicst_set(), and the MAC_NOTE_UNICST notification - * will be sent there. 
- */ - if ((grp_arg == NULL) && mac_addr_changed) - mac_unicst_update(grp->lg_mh, grp->lg_addr); + if (link_state_changed) + mac_link_update(grp->lg_mh, grp->lg_link_state); - if (link_state_changed) - mac_link_update(grp->lg_mh, grp->lg_link_state); + if (mac_addr_changed) + mac_unicst_update(grp->lg_mh, grp->lg_addr); - } + return (0); +} - if (grp_arg == NULL) { - if (grp != NULL) { - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - } +/* + * Update properties of an existing link aggregation group. + */ +int +aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, + boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, + aggr_lacp_timer_t lacp_timer) +{ + aggr_grp_t *grp = NULL; + mac_perim_handle_t mph; + int err; + + /* get group corresponding to linkid */ + rw_enter(&aggr_grp_lock, RW_READER); + if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), + (mod_hash_val_t *)&grp) != 0) { rw_exit(&aggr_grp_lock); + return (ENOENT); } + AGGR_GRP_REFHOLD(grp); - if (grp != NULL) - AGGR_GRP_REFRELE(grp); + /* + * Hold the perimeter so that the aggregation won't be destroyed. + */ + mac_perim_enter_by_mh(grp->lg_mh, &mph); + rw_exit(&aggr_grp_lock); - return (rc); + err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, + mac_addr, lacp_mode, lacp_timer); + + mac_perim_exit(mph); + AGGR_GRP_REFRELE(grp); + return (err); } /* @@ -676,6 +977,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, aggr_port_t *port; mac_register_t *mac; boolean_t link_state_changed; + mac_perim_handle_t mph; int err; int i; @@ -695,9 +997,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - grp->lg_refs = 1; grp->lg_closing = B_FALSE; grp->lg_force = force; @@ -707,6 +1006,11 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; grp->lg_started = B_FALSE; grp->lg_promisc = B_FALSE; + grp->lg_lacp_done = B_FALSE; + grp->lg_lacp_head = grp->lg_lacp_tail = NULL; + grp->lg_lacp_rx_thread = thread_create(NULL, 0, + aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); + bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); aggr_lacp_init_grp(grp); /* add MAC ports to group */ @@ -723,7 +1027,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, goto bail; } grp->lg_key = key; - grp->lg_mcst_list = NULL; for (i = 0; i < nports; i++) { err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); @@ -748,17 +1051,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_mac_addr_port = grp->lg_ports; } - /* - * Update the MAC address of the constituent ports. - * None of the port is attached at this time, the link state of the - * aggregation will not change. 
- */
- link_state_changed = aggr_grp_update_ports_mac(grp);
- ASSERT(!link_state_changed);
-
- /* update outbound load balancing policy */
- aggr_send_update_policy(grp, policy);
-
 /* set the initial group capabilities */
 aggr_grp_capab_set(grp);
 
@@ -775,6 +1067,7 @@
 mac->m_min_sdu = 0;
 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
 mac->m_margin = aggr_grp_max_margin(grp);
+ mac->m_v12n = MAC_VIRT_LEVEL1;
 err = mac_register(mac, &grp->lg_mh);
 mac_free(mac);
 if (err != 0)
@@ -782,9 +1075,23 @@
 
 if ((err = dls_devnet_create(grp->lg_mh, grp->lg_linkid)) != 0) {
 (void) mac_unregister(grp->lg_mh);
+ grp->lg_mh = NULL;
 goto bail;
 }
 
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+
+ /*
+ * Update the MAC address of the constituent ports.
+ * None of the ports is attached at this time, so the link state of
+ * the aggregation will not change.
+ */
+ link_state_changed = aggr_grp_update_ports_mac(grp);
+ ASSERT(!link_state_changed);
+
+ /* update outbound load balancing policy */
+ aggr_send_update_policy(grp, policy);
+
 /* set LACP mode */
 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
 
@@ -792,8 +1099,19 @@
 * Attach each port if necessary.
 */
 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- if (aggr_port_notify_link(grp, port, B_FALSE))
+ /*
+ * Create the pseudo ring for each HW ring of the underlying
+ * port. Note that this is done after the aggr registers the
+ * mac.
+ */
+ VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
+ if (aggr_port_notify_link(grp, port))
 link_state_changed = B_TRUE;
+
+ /*
+ * Initialize the callback functions for this port.
+ */
+ aggr_port_init_callbacks(port);
 }
 
 if (link_state_changed)
@@ -805,31 +1123,35 @@
 ASSERT(err == 0);
 aggr_grp_cnt++;
 
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ mac_perim_exit(mph);
 rw_exit(&aggr_grp_lock);
 return (0);
 
bail:
- if (grp != NULL) {
- aggr_port_t *cport;
- grp->lg_closing = B_TRUE;
-
- port = grp->lg_ports;
- while (port != NULL) {
- cport = port->lp_next;
- aggr_port_delete(port);
- port = cport;
- }
+ grp->lg_closing = B_TRUE;
 
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ port = grp->lg_ports;
+ while (port != NULL) {
+ aggr_port_t *cport;
 
- AGGR_GRP_REFRELE(grp);
+ cport = port->lp_next;
+ aggr_port_delete(port);
+ port = cport;
 }
 
+ /*
+ * Inform the lacp_rx thread to exit.
+ */ + mutex_enter(&grp->lg_lacp_lock); + grp->lg_lacp_done = B_TRUE; + cv_signal(&grp->lg_lacp_cv); + while (grp->lg_lacp_rx_thread != NULL) + cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); + mutex_exit(&grp->lg_lacp_lock); + rw_exit(&aggr_grp_lock); + AGGR_GRP_REFRELE(grp); return (err); } @@ -841,7 +1163,7 @@ aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) { aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { if (port->lp_linkid == linkid) @@ -862,12 +1184,12 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, aggr_port_t **pport; boolean_t mac_addr_changed = B_FALSE; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t mph; uint64_t val; uint_t i; uint_t stat; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); ASSERT(grp->lg_nports > 1); ASSERT(!grp->lg_closing); @@ -881,9 +1203,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, } *pport = port->lp_next; - atomic_add_32(&port->lp_closing, 1); - - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* * If the MAC address of the port being removed was assigned @@ -900,7 +1220,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, mac_addr_changed = B_TRUE; } - link_state_changed = aggr_grp_detach_port(grp, port, B_FALSE); + link_state_changed = aggr_grp_detach_port(grp, port); /* * Add the counter statistics of the ports while it was aggregated @@ -909,7 +1229,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, * value of the counter at the moment it was added to the * aggregation. */ - for (i = 0; i < MAC_NSTAT && !grp->lg_closing; i++) { + for (i = 0; i < MAC_NSTAT; i++) { stat = i + MAC_STAT_MIN; if (!MAC_STAT_ISACOUNTER(stat)) continue; @@ -917,7 +1237,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, val -= port->lp_stat[i]; grp->lg_stat[i] += val; } - for (i = 0; i < ETHER_NSTAT && !grp->lg_closing; i++) { + for (i = 0; i < ETHER_NSTAT; i++) { stat = i + MACTYPE_STAT_MIN; if (!ETHER_STAT_ISACOUNTER(stat)) continue; @@ -927,8 +1247,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, } grp->lg_nports--; - - rw_exit(&port->lp_lock); + mac_perim_exit(mph); aggr_port_delete(port); @@ -960,6 +1279,7 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) aggr_port_t *port; boolean_t mac_addr_update = B_FALSE, mac_addr_changed; boolean_t link_state_update = B_FALSE, link_state_changed; + mac_perim_handle_t mph, pmph; /* get group corresponding to linkid */ rw_enter(&aggr_grp_lock, RW_READER); @@ -969,10 +1289,12 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) return (ENOENT); } AGGR_GRP_REFHOLD(grp); - rw_exit(&aggr_grp_lock); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); + /* + * Hold the perimeter so that the aggregation won't be destroyed. 
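
This lookup protocol recurs throughout the file; its shape, as a sketch assembled from the surrounding code (illustrative, not part of the patch):

	aggr_grp_t *grp = NULL;
	mac_perim_handle_t mph;

	rw_enter(&aggr_grp_lock, RW_READER);
	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}
	AGGR_GRP_REFHOLD(grp);				/* pin the memory */
	mac_perim_enter_by_mh(grp->lg_mh, &mph);	/* serialize vs. delete */
	rw_exit(&aggr_grp_lock);

	/* ... operate on the aggregation ... */

	mac_perim_exit(mph);
	AGGR_GRP_REFRELE(grp);

The global rwlock is held only long enough to find the group and pin it; it is the per-MAC perimeter, not the rwlock, that serializes against aggr_grp_delete().
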
+ */ + mac_perim_enter_by_mh(grp->lg_mh, &mph); + rw_exit(&aggr_grp_lock); /* we need to keep at least one port per group */ if (nports >= grp->lg_nports) { @@ -989,20 +1311,51 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) } } + /* clear the promiscous mode for the specified ports */ + for (i = 0; i < nports && rc == 0; i++) { + /* lookup port */ + port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); + ASSERT(port != NULL); + + mac_perim_enter_by_mh(port->lp_mh, &pmph); + rc = aggr_port_promisc(port, B_FALSE); + mac_perim_exit(pmph); + } + if (rc != 0) { + for (i = 0; i < nports; i++) { + port = aggr_grp_port_lookup(grp, + ports[i].lp_linkid); + ASSERT(port != NULL); + + /* + * Turn the promiscuous mode back on if it is required + * to receive the non-primary address over a port, or + * the promiscous mode is enabled over the aggr. + */ + mac_perim_enter_by_mh(port->lp_mh, &pmph); + if (port->lp_started && (grp->lg_promisc || + port->lp_prom_addr != NULL)) { + (void) aggr_port_promisc(port, B_TRUE); + } + mac_perim_exit(pmph); + } + goto bail; + } + /* remove the specified ports from group */ - for (i = 0; i < nports && !grp->lg_closing; i++) { + for (i = 0; i < nports; i++) { /* lookup port */ port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); /* stop port if group has already been started */ if (grp->lg_started) { - rw_enter(&port->lp_lock, RW_WRITER); - aggr_lacp_port_detached(port); + mac_perim_enter_by_mh(port->lp_mh, &pmph); aggr_port_stop(port); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); /* remove port from group */ rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, &link_state_changed); @@ -1012,16 +1365,14 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) } bail: - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - if (!grp->lg_closing) { - if (mac_addr_update) - mac_unicst_update(grp->lg_mh, grp->lg_addr); - if (link_state_update) - mac_link_update(grp->lg_mh, grp->lg_link_state); - if (rc == 0) - mac_resource_update(grp->lg_mh); - } + if (mac_addr_update) + mac_unicst_update(grp->lg_mh, grp->lg_addr); + if (link_state_update) + mac_link_update(grp->lg_mh, grp->lg_link_state); + if (rc == 0) + mac_resource_update(grp->lg_mh); + + mac_perim_exit(mph); AGGR_GRP_REFRELE(grp); return (rc); @@ -1032,9 +1383,9 @@ aggr_grp_delete(datalink_id_t linkid) { aggr_grp_t *grp = NULL; aggr_port_t *port, *cport; - lg_mcst_addr_t *mcst, *mcst_nextp; datalink_id_t tmpid; mod_hash_val_t val; + mac_perim_handle_t mph, pmph; int err; rw_enter(&aggr_grp_lock, RW_WRITER); @@ -1051,68 +1402,69 @@ aggr_grp_delete(datalink_id_t linkid) * aggr_m_stat() and thus has a kstat_hold() on the kstats that * dls_devnet_destroy() needs to delete. */ - if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid)) != 0) { + if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { rw_exit(&aggr_grp_lock); return (err); } ASSERT(linkid == tmpid); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - /* * Unregister from the MAC service module. Since this can * fail if a client hasn't closed the MAC port, we gracefully * fail the operation. 
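
Back in aggr_grp_rem_port() above, the counter hand-back is easier to see with numbers (invented for illustration; the real current value comes from the port's statistics):

	/* MAC_STAT_IPACKETS read 1000 when the port joined the aggr... */
	port->lp_stat[i] = 1000;	/* join-time snapshot */
	/* ...and reads 5000 as the port is removed */
	val = 5000;			/* current counter */
	val -= port->lp_stat[i];	/* delta accumulated while aggregated */
	grp->lg_stat[i] += val;		/* group absorbs 4000 */

Only traffic seen while the port was a member is credited to the group.
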
*/ - grp->lg_closing = B_TRUE; if ((err = mac_disable(grp->lg_mh)) != 0) { - grp->lg_closing = B_FALSE; - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - (void) dls_devnet_create(grp->lg_mh, linkid); rw_exit(&aggr_grp_lock); return (err); } + (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); + ASSERT(grp == (aggr_grp_t *)val); + + ASSERT(aggr_grp_cnt > 0); + aggr_grp_cnt--; + rw_exit(&aggr_grp_lock); /* - * Free the list of multicast addresses. + * Inform the lacp_rx thread to exit. */ - for (mcst = grp->lg_mcst_list; mcst != NULL; mcst = mcst_nextp) { - mcst_nextp = mcst->lg_mcst_nextp; - kmem_free(mcst, sizeof (lg_mcst_addr_t)); - } - grp->lg_mcst_list = NULL; + mutex_enter(&grp->lg_lacp_lock); + grp->lg_lacp_done = B_TRUE; + cv_signal(&grp->lg_lacp_cv); + while (grp->lg_lacp_rx_thread != NULL) + cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); + mutex_exit(&grp->lg_lacp_lock); + mac_perim_enter_by_mh(grp->lg_mh, &mph); + + grp->lg_closing = B_TRUE; /* detach and free MAC ports associated with group */ port = grp->lg_ports; while (port != NULL) { cport = port->lp_next; - rw_enter(&port->lp_lock, RW_WRITER); - aggr_lacp_port_detached(port); + mac_perim_enter_by_mh(port->lp_mh, &pmph); if (grp->lg_started) aggr_port_stop(port); - (void) aggr_grp_detach_port(grp, port, B_FALSE); - rw_exit(&port->lp_lock); + (void) aggr_grp_detach_port(grp, port); + mac_perim_exit(pmph); + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); aggr_port_delete(port); port = cport; } - VERIFY(mac_unregister(grp->lg_mh) == 0); + mac_perim_exit(mph); - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - - (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); - ASSERT(grp == (aggr_grp_t *)val); + /* + * Wait for the port's lacp timer thread and its notification callback + * to exit before calling mac_unregister() since both needs to access + * the mac perimeter of the grp. + */ + aggr_grp_port_wait(grp); - ASSERT(aggr_grp_cnt > 0); - aggr_grp_cnt--; + VERIFY(mac_unregister(grp->lg_mh) == 0); + grp->lg_mh = NULL; - rw_exit(&aggr_grp_lock); AGGR_GRP_REFRELE(grp); - return (0); } @@ -1120,6 +1472,7 @@ void aggr_grp_free(aggr_grp_t *grp) { ASSERT(grp->lg_refs == 0); + ASSERT(grp->lg_port_ref == 0); if (grp->lg_key > AGGR_MAX_KEY) { id_free(key_ids, grp->lg_key); grp->lg_key = 0; @@ -1134,6 +1487,7 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg, { aggr_grp_t *grp; aggr_port_t *port; + mac_perim_handle_t mph, pmph; int rc = 0; rw_enter(&aggr_grp_lock, RW_READER); @@ -1143,8 +1497,10 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg, rw_exit(&aggr_grp_lock); return (ENOENT); } + AGGR_GRP_REFHOLD(grp); - rw_enter(&grp->lg_lock, RW_READER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); + rw_exit(&aggr_grp_lock); rc = new_grp_fn(fn_arg, grp->lg_linkid, (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, @@ -1155,32 +1511,21 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg, goto bail; for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_READER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, port->lp_state, &port->lp_lacp.ActorOperPortState); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); if (rc != 0) goto bail; } bail: - rw_exit(&grp->lg_lock); - rw_exit(&aggr_grp_lock); + mac_perim_exit(mph); + AGGR_GRP_REFRELE(grp); return (rc); } -static void -aggr_m_resources(void *arg) -{ - aggr_grp_t *grp = arg; - aggr_port_t *port; - - /* Call each port's m_resources function */ - for (port = grp->lg_ports; port != NULL; port = port->lp_next) - mac_resources(port->lp_mh); -} - /*ARGSUSED*/ static void aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) @@ -1230,10 +1575,11 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) static int aggr_m_stat(void *arg, uint_t stat, uint64_t *val) { - aggr_grp_t *grp = arg; - int rval = 0; + aggr_grp_t *grp = arg; + mac_perim_handle_t mph; + int rval = 0; - rw_enter(&grp->lg_lock, RW_READER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); switch (stat) { case MAC_STAT_IFSPEED: @@ -1253,7 +1599,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val) rval = aggr_grp_stat(grp, stat, val); } - rw_exit(&grp->lg_lock); + mac_perim_exit(mph); return (rval); } @@ -1262,9 +1608,9 @@ aggr_m_start(void *arg) { aggr_grp_t *grp = arg; aggr_port_t *port; + mac_perim_handle_t mph, pmph; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); /* * Attempts to start all configured members of the group. @@ -1272,23 +1618,27 @@ aggr_m_start(void *arg) * is received. */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); if (aggr_port_start(port) != 0) { - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); continue; } - /* set port promiscuous mode */ - if (aggr_port_promisc(port, grp->lg_promisc) != 0) - aggr_port_stop(port); - rw_exit(&port->lp_lock); + /* + * Turn on the promiscuous mode if it is required to receive + * the non-primary address over a port, or the promiscous + * mode is enabled over the aggr. 
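
The promiscuity decision above (and its mirror image in the aggr_grp_rem_ports() rollback earlier) reduces to one predicate; a hypothetical helper capturing it, using only fields visible in this patch:

	static boolean_t
	aggr_port_needs_promisc(aggr_grp_t *grp, aggr_port_t *port)
	{
		/* aggr itself promiscuous, or non-primary unicast addrs in use */
		return (grp->lg_promisc || port->lp_prom_addr != NULL);
	}
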
+ */ + if (grp->lg_promisc || port->lp_prom_addr != NULL) { + if (aggr_port_promisc(port, B_TRUE) != 0) + aggr_port_stop(port); + } + mac_perim_exit(pmph); } grp->lg_started = B_TRUE; - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - + mac_perim_exit(mph); return (0); } @@ -1297,21 +1647,22 @@ aggr_m_stop(void *arg) { aggr_grp_t *grp = arg; aggr_port_t *port; + mac_perim_handle_t mph, pmph; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_WRITER); - aggr_lacp_port_detached(port); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* reset port promiscuous mode */ + (void) aggr_port_promisc(port, B_FALSE); + aggr_port_stop(port); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } grp->lg_started = B_FALSE; - - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); + mac_perim_exit(mph); } static int @@ -1320,10 +1671,10 @@ aggr_m_promisc(void *arg, boolean_t on) aggr_grp_t *grp = arg; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t mph, pmph; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); AGGR_GRP_REFHOLD(grp); + mac_perim_enter_by_mh(grp->lg_mh, &mph); ASSERT(!grp->lg_closing); @@ -1331,25 +1682,30 @@ aggr_m_promisc(void *arg, boolean_t on) goto bail; for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_WRITER); + int err = 0; + + mac_perim_enter_by_mh(port->lp_mh, &pmph); AGGR_PORT_REFHOLD(port); - if (port->lp_started) { - if (aggr_port_promisc(port, on) != 0) { - if (aggr_grp_detach_port(grp, port, B_TRUE)) - link_state_changed = B_TRUE; - } else { - /* - * If a port was detached because of a previous - * failure changing the promiscuity, the port - * is reattached when it successfully changes - * the promiscuity now, and this might cause - * the link state of the aggregation to change. - */ - if (aggr_grp_attach_port(grp, port)) - link_state_changed = B_TRUE; - } + if (!on && (port->lp_prom_addr == NULL)) + err = aggr_port_promisc(port, B_FALSE); + else if (on && port->lp_started) + err = aggr_port_promisc(port, B_TRUE); + + if (err != 0) { + if (aggr_grp_detach_port(grp, port)) + link_state_changed = B_TRUE; + } else { + /* + * If a port was detached because of a previous + * failure changing the promiscuity, the port + * is reattached when it successfully changes + * the promiscuity now, and this might cause + * the link state of the aggregation to change. + */ + if (aggr_grp_attach_port(grp, port)) + link_state_changed = B_TRUE; } - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); AGGR_PORT_REFRELE(port); } @@ -1359,13 +1715,49 @@ aggr_m_promisc(void *arg, boolean_t on) mac_link_update(grp->lg_mh, grp->lg_link_state); bail: - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); + mac_perim_exit(mph); AGGR_GRP_REFRELE(grp); return (0); } +static void +aggr_grp_port_rename(const char *new_name, void *arg) +{ + /* + * aggr port's mac client name is the format of "aggr link name" plus + * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
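
A concrete example (hypothetical link names; the delimiter is whatever AGGR_PORT_NAME_DELIMIT expands to, assumed '-' here): renaming aggr foo0 to bar0 rewrites only the prefix of each port's MAC client name:

	"foo0-bge0"  ->  "bar0-bge0"
	"foo0-bge1"  ->  "bar0-bge1"
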
+ */
+	int aggr_len, link_len, clnt_name_len, i;
+	char *str_end, *str_st, *str_del;
+	char aggr_name[MAXNAMELEN];
+	char link_name[MAXNAMELEN];
+	char *clnt_name;
+	aggr_grp_t *aggr_grp = arg;
+	aggr_port_t *aggr_port = aggr_grp->lg_ports;
+
+	for (i = 0; i < aggr_grp->lg_nports; i++) {
+		clnt_name = mac_client_name(aggr_port->lp_mch);
+		clnt_name_len = strlen(clnt_name);
+		str_st = clnt_name;
+		str_end = &(clnt_name[clnt_name_len]);
+		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
+		ASSERT(str_del != NULL);
+		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
+		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
+		bzero(aggr_name, MAXNAMELEN);
+		bzero(link_name, MAXNAMELEN);
+		bcopy(clnt_name, aggr_name, aggr_len);
+		bcopy(str_del, link_name, link_len + 1);
+		bzero(clnt_name, MAXNAMELEN);
+		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
+		    link_name);
+
+		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
+		aggr_port = aggr_port->lp_next;
+	}
+}
+
 /*
  * Initialize the capabilities that are advertised for the group
  * according to the capabilities of the constituent ports.
@@ -1381,51 +1773,245 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
 		*hcksum_txflags = grp->lg_hcksum_txflags;
 		break;
 	}
-	case MAC_CAPAB_POLL:
-		/*
-		 * There's nothing for us to fill in, we simply return
-		 * B_TRUE or B_FALSE to represent the group's support
-		 * status for this capability.
-		 */
-		return (grp->lg_gldv3_polling);
 	case MAC_CAPAB_NO_NATIVEVLAN:
 		return (!grp->lg_vlan);
 	case MAC_CAPAB_NO_ZCOPY:
 		return (!grp->lg_zcopy);
+	case MAC_CAPAB_RINGS: {
+		mac_capab_rings_t *cap_rings = cap_data;
+
+		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+			cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
+			cap_rings->mr_rget = aggr_fill_ring;
+
+			/*
+			 * An aggregation advertises only one (pseudo) RX
+			 * group, which virtualizes the main/primary group of
+			 * the underlying devices.
+			 */
+			cap_rings->mr_gnum = 1;
+			cap_rings->mr_gget = aggr_fill_group;
+			cap_rings->mr_gaddring = NULL;
+			cap_rings->mr_gremring = NULL;
+		} else {
+			return (B_FALSE);
+		}
+		break;
+	}
+	case MAC_CAPAB_AGGR:
+	{
+		mac_capab_aggr_t *aggr_cap;
+
+		if (cap_data != NULL) {
+			aggr_cap = cap_data;
+			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
+			aggr_cap->mca_unicst = aggr_m_unicst;
+		}
+		return (B_TRUE);
+	}
 	default:
 		return (B_FALSE);
 	}
 	return (B_TRUE);
 }
 
+/*
+ * Callback function for MAC layer to register groups.
+ */
+static void
+aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	aggr_grp_t *grp = arg;
+	aggr_pseudo_rx_group_t *rx_group;
+
+	ASSERT(rtype == MAC_RING_TYPE_RX && index == 0);
+	rx_group = &grp->lg_rx_group;
+	rx_group->arg_gh = gh;
+	rx_group->arg_grp = grp;
+
+	infop->mgi_driver = (mac_group_driver_t)rx_group;
+	infop->mgi_start = NULL;
+	infop->mgi_stop = NULL;
+	infop->mgi_addmac = aggr_addmac;
+	infop->mgi_remmac = aggr_remmac;
+	infop->mgi_count = rx_group->arg_ring_cnt;
+}
+
+/*
+ * Callback function for MAC layer to register all rings.
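
Between the two fill callbacks it is worth sketching who calls them. Roughly (an editorial sketch of the flow implied by the MAC_CAPAB_RINGS answer above, not actual mac-layer source):

	/*
	 * After mac_register(), the MAC layer queries MAC_CAPAB_RINGS and
	 * then has the driver describe the advertised topology:
	 */
	mr_gget(driver, MAC_RING_TYPE_RX, 0, &group_info, group_handle);
		/* -> aggr_fill_group(): the one static pseudo RX group */
	for (i = 0; i < mr_rnum; i++)
		mr_rget(driver, MAC_RING_TYPE_RX, 0, i, &ring_info, ring_handle);
		/* -> aggr_fill_ring(): one pseudo ring per underlying HW ring */
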
+ */ +static void +aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + aggr_grp_t *grp = arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_ring_t *rx_ring; + mac_intr_t aggr_mac_intr; + + ASSERT(rg_index == 0); + + ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); + rx_ring = rx_group->arg_rings + index; + rx_ring->arr_rh = rh; + + /* + * Entrypoint to enable interrupt (disable poll) and + * disable interrupt (enable poll). + */ + aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; + aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; + aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = aggr_pseudo_start_ring; + infop->mri_stop = aggr_pseudo_stop_ring; + + infop->mri_intr = aggr_mac_intr; + infop->mri_poll = aggr_rx_poll; + break; + } + default: + break; + } +} + +static mblk_t * +aggr_rx_poll(void *arg, int bytes_to_pickup) +{ + aggr_pseudo_rx_ring_t *rr_ring = arg; + aggr_port_t *port = rr_ring->arr_port; + aggr_grp_t *grp = port->lp_grp; + mblk_t *mp_chain, *mp, **mpp; + + mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); + + if (grp->lg_lacp_mode == AGGR_LACP_OFF) + return (mp_chain); + + mpp = &mp_chain; + while ((mp = *mpp) != NULL) { + if (MBLKL(mp) >= sizeof (struct ether_header)) { + struct ether_header *ehp; + + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { + *mpp = mp->b_next; + mp->b_next = NULL; + aggr_recv_lacp(port, + (mac_resource_handle_t)rr_ring, mp); + continue; + } + } + + if (!port->lp_collector_enabled) { + *mpp = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + continue; + } + mpp = &mp->b_next; + } + return (mp_chain); +} + static int -aggr_grp_multicst(aggr_grp_t *grp, boolean_t add, const uint8_t *addrp) +aggr_addmac(void *arg, const uint8_t *mac_addr) { - lg_mcst_addr_t *mcst, **ppmcst; + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; + aggr_unicst_addr_t *addr, **pprev; + aggr_grp_t *grp = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + + mac_perim_enter_by_mh(grp->lg_mh, &mph); + + if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { + mac_perim_exit(mph); + return (0); + } - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + /* + * Insert this mac address into the list of mac addresses owned by + * the aggregation pseudo group. 
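
Backing up to aggr_rx_poll() above: the chain walk there is the standard pointer-to-pointer unlink, the same idiom the unicast-address list walks in aggr_addmac()/aggr_remmac() use. Reduced to a self-contained helper (hypothetical; keep() supplied by the caller, mblk_t from <sys/stream.h>):

	static mblk_t *
	filter_chain(mblk_t *head, boolean_t (*keep)(mblk_t *))
	{
		mblk_t **mpp = &head, *mp;

		while ((mp = *mpp) != NULL) {
			if (keep(mp)) {
				mpp = &mp->b_next;	/* advance in place */
			} else {
				*mpp = mp->b_next;	/* unlink */
				mp->b_next = NULL;
				freemsg(mp);
			}
		}
		return (head);
	}

Because mpp points at the previous link's b_next (or at head), removal never needs a separate "previous element" variable.
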
+ */ + pprev = &rx_group->arg_macaddr; + while ((addr = *pprev) != NULL) { + if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { + mac_perim_exit(mph); + return (EEXIST); + } + pprev = &addr->aua_next; + } + addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); + bcopy(mac_addr, addr->aua_addr, ETHERADDRL); + addr->aua_next = NULL; + *pprev = addr; - for (ppmcst = &(grp->lg_mcst_list); (mcst = *ppmcst) != NULL; - ppmcst = &(mcst->lg_mcst_nextp)) { - if (bcmp(mcst->lg_mcst_addr, addrp, MAXMACADDRLEN) == 0) + for (port = grp->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_addmac(port, mac_addr)) != 0) break; + + if (err != 0) { + for (p = grp->lg_ports; p != port; p = p->lp_next) + aggr_port_remmac(p, mac_addr); + + *pprev = NULL; + kmem_free(addr, sizeof (aggr_unicst_addr_t)); } - if (add) { - if (mcst != NULL) - return (0); - mcst = kmem_zalloc(sizeof (lg_mcst_addr_t), KM_NOSLEEP); - if (mcst == NULL) - return (ENOMEM); - bcopy(addrp, mcst->lg_mcst_addr, MAXMACADDRLEN); - *ppmcst = mcst; - } else { - if (mcst == NULL) - return (ENOENT); - *ppmcst = mcst->lg_mcst_nextp; - kmem_free(mcst, sizeof (lg_mcst_addr_t)); + mac_perim_exit(mph); + return (err); +} + +static int +aggr_remmac(void *arg, const uint8_t *mac_addr) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; + aggr_unicst_addr_t *addr, **pprev; + aggr_grp_t *grp = rx_group->arg_grp; + aggr_port_t *port; + mac_perim_handle_t mph; + int err = 0; + + mac_perim_enter_by_mh(grp->lg_mh, &mph); + + if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { + mac_perim_exit(mph); + return (0); } - return (0); + + /* + * Insert this mac address into the list of mac addresses owned by + * the aggregation pseudo group. + */ + pprev = &rx_group->arg_macaddr; + while ((addr = *pprev) != NULL) { + if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { + pprev = &addr->aua_next; + continue; + } + break; + } + if (addr == NULL) { + mac_perim_exit(mph); + return (EINVAL); + } + + for (port = grp->lg_ports; port != NULL; port = port->lp_next) + aggr_port_remmac(port, mac_addr); + + *pprev = addr->aua_next; + kmem_free(addr, sizeof (aggr_unicst_addr_t)); + + mac_perim_exit(mph); + return (err); } /* @@ -1438,17 +2024,14 @@ void aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) { aggr_grp_t *grp = port->lp_grp; - lg_mcst_addr_t *mcst; - ASSERT(RW_WRITE_HELD(&port->lp_lock)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); if (!port->lp_started) return; - for (mcst = grp->lg_mcst_list; mcst != NULL; - mcst = mcst->lg_mcst_nextp) - (void) aggr_port_multicst(port, add, mcst->lg_mcst_addr); + mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); } static int @@ -1456,19 +2039,18 @@ aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) { aggr_grp_t *grp = arg; aggr_port_t *port = NULL; + mac_perim_handle_t mph; int err = 0, cerr; - rw_enter(&grp->lg_lock, RW_WRITER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { if (port->lp_state != AGGR_PORT_STATE_ATTACHED) continue; cerr = aggr_port_multicst(port, add, addrp); - if (cerr == 0) - (void) aggr_grp_multicst(grp, add, addrp); if (cerr != 0 && err == 0) err = cerr; } - rw_exit(&grp->lg_lock); + mac_perim_exit(mph); return (err); } @@ -1476,16 +2058,14 @@ static int aggr_m_unicst(void *arg, const uint8_t *macaddr) { aggr_grp_t *grp = arg; - int rc; + mac_perim_handle_t mph; + int 
err; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - rc = aggr_grp_modify(0, grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, + mac_perim_enter_by_mh(grp->lg_mh, &mph); + err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 0, 0); - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - - return (rc); + mac_perim_exit(mph); + return (err); } /* @@ -1498,11 +2078,10 @@ aggr_grp_capab_set(aggr_grp_t *grp) uint32_t cksum; aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(grp->lg_mh == NULL); ASSERT(grp->lg_ports != NULL); grp->lg_hcksum_txflags = (uint32_t)-1; - grp->lg_gldv3_polling = B_TRUE; grp->lg_zcopy = B_TRUE; grp->lg_vlan = B_TRUE; @@ -1516,9 +2095,6 @@ aggr_grp_capab_set(aggr_grp_t *grp) grp->lg_zcopy &= !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); - - grp->lg_gldv3_polling &= - mac_capab_get(port->lp_mh, MAC_CAPAB_POLL, NULL); } } @@ -1551,11 +2127,6 @@ aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) return (B_FALSE); } - if (mac_capab_get(port->lp_mh, MAC_CAPAB_POLL, NULL) != - grp->lg_gldv3_polling) { - return (B_FALSE); - } - return (B_TRUE); } @@ -1568,7 +2139,7 @@ aggr_grp_max_sdu(aggr_grp_t *grp) uint_t max_sdu = (uint_t)-1; aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(grp->lg_mh == NULL); ASSERT(grp->lg_ports != NULL); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { @@ -1605,7 +2176,7 @@ aggr_grp_max_margin(aggr_grp_t *grp) uint32_t margin = UINT32_MAX; aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(grp->lg_mh == NULL); ASSERT(grp->lg_ports != NULL); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { diff --git a/usr/src/uts/common/io/aggr/aggr_lacp.c b/usr/src/uts/common/io/aggr/aggr_lacp.c index 09330f8df1..0916533c48 100644 --- a/usr/src/uts/common/io/aggr/aggr_lacp.c +++ b/usr/src/uts/common/io/aggr/aggr_lacp.c @@ -29,8 +29,10 @@ #include <sys/types.h> #include <sys/sysmacros.h> +#include <sys/callb.h> #include <sys/conf.h> #include <sys/cmn_err.h> +#include <sys/disp.h> #include <sys/list.h> #include <sys/ksynch.h> #include <sys/kmem.h> @@ -97,8 +99,8 @@ typedef struct lacp_sel_ports { static lacp_sel_ports_t *sel_ports = NULL; static kmutex_t lacp_sel_lock; -static void periodic_timer_pop_locked(aggr_port_t *); static void periodic_timer_pop(void *); +static void periodic_timer_pop_handler(aggr_port_t *); static void lacp_xmit_sm(aggr_port_t *); static void lacp_periodic_sm(aggr_port_t *); static void fill_lacp_pdu(aggr_port_t *, lacp_t *); @@ -108,16 +110,18 @@ static void lacp_off(aggr_port_t *); static boolean_t valid_lacp_pdu(aggr_port_t *, lacp_t *); static void lacp_receive_sm(aggr_port_t *, lacp_t *); static void aggr_set_coll_dist(aggr_port_t *, boolean_t); -static void aggr_set_coll_dist_locked(aggr_port_t *, boolean_t); static void start_wait_while_timer(aggr_port_t *); static void stop_wait_while_timer(aggr_port_t *); static void lacp_reset_port(aggr_port_t *); static void stop_current_while_timer(aggr_port_t *); static void current_while_timer_pop(void *); +static void current_while_timer_pop_handler(aggr_port_t *); static void update_default_selected(aggr_port_t *); static boolean_t update_selected(aggr_port_t *, lacp_t *); static boolean_t lacp_sel_ports_add(aggr_port_t *); static void lacp_sel_ports_del(aggr_port_t *); +static void wait_while_timer_pop(void *); +static void wait_while_timer_pop_handler(aggr_port_t *); void aggr_lacp_init(void) @@ -132,13 +136,96 @@ aggr_lacp_fini(void) } /* + * The following 
functions are used for handling LACP timers. + * + * Note that we cannot fully rely on the aggr's mac perimeter in the timeout + * handler routine, otherwise it may cause deadlock with the untimeout() call + * which is usually called with the mac perimeter held. Instead, a + * lacp_timer_lock mutex is introduced, which protects a bitwise flag + * (lacp_timer_bits). This flag is set/cleared by timeout()/stop_timer() + * routines and is checked by a dedicated thread, that executes the real + * timeout operation. + */ +static void +aggr_port_timer_thread(void *arg) +{ + aggr_port_t *port = arg; + aggr_lacp_port_t *pl = &port->lp_lacp; + aggr_grp_t *grp = port->lp_grp; + uint32_t lacp_timer_bits; + mac_perim_handle_t mph; + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &pl->lacp_timer_lock, callb_generic_cpr, + "aggr_port_timer_thread"); + + mutex_enter(&pl->lacp_timer_lock); + + for (;;) { + + if ((lacp_timer_bits = pl->lacp_timer_bits) == 0) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&pl->lacp_timer_cv, &pl->lacp_timer_lock); + CALLB_CPR_SAFE_END(&cprinfo, &pl->lacp_timer_lock); + continue; + } + pl->lacp_timer_bits = 0; + + if (lacp_timer_bits & LACP_THREAD_EXIT) + break; + + if (lacp_timer_bits & LACP_PERIODIC_TIMEOUT) + pl->periodic_timer.id = 0; + if (lacp_timer_bits & LACP_WAIT_WHILE_TIMEOUT) + pl->wait_while_timer.id = 0; + if (lacp_timer_bits & LACP_CURRENT_WHILE_TIMEOUT) + pl->current_while_timer.id = 0; + + mutex_exit(&pl->lacp_timer_lock); + + mac_perim_enter_by_mh(grp->lg_mh, &mph); + if (port->lp_closing) { + mac_perim_exit(mph); + mutex_enter(&pl->lacp_timer_lock); + break; + } + + if (lacp_timer_bits & LACP_PERIODIC_TIMEOUT) + periodic_timer_pop_handler(port); + if (lacp_timer_bits & LACP_WAIT_WHILE_TIMEOUT) + wait_while_timer_pop_handler(port); + if (lacp_timer_bits & LACP_CURRENT_WHILE_TIMEOUT) + current_while_timer_pop_handler(port); + mac_perim_exit(mph); + + mutex_enter(&pl->lacp_timer_lock); + if (pl->lacp_timer_bits & LACP_THREAD_EXIT) + break; + } + + pl->lacp_timer_bits = 0; + pl->lacp_timer_thread = NULL; + cv_broadcast(&pl->lacp_timer_cv); + + /* CALLB_CPR_EXIT drops the lock */ + CALLB_CPR_EXIT(&cprinfo); + + /* + * Release the reference of the grp so aggr_grp_delete() can call + * mac_unregister() safely. + */ + aggr_grp_port_rele(port); + thread_exit(); +} + +/* * Set the port LACP state to SELECTED. Returns B_FALSE if the operation * could not be performed due to a memory allocation error, B_TRUE otherwise. 
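
Stepping back to the timer machinery introduced above: the stop side pairs with that worker thread as follows (a sketch mirroring stop_periodic_timer() below):

	mutex_enter(&pl->lacp_timer_lock);
	if ((id = pl->periodic_timer.id) != 0) {
		pl->lacp_timer_bits &= ~LACP_PERIODIC_TIMEOUT;
		pl->periodic_timer.id = 0;
	}
	mutex_exit(&pl->lacp_timer_lock);

	if (id != 0)
		(void) untimeout(id);

untimeout() is only ever called after lacp_timer_lock has been dropped, and a pop that already fired but has not yet been serviced is neutralized by clearing its bit, so the worker never runs a stale handler.
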
*/ static boolean_t lacp_port_select(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); if (!lacp_sel_ports_add(portp)) return (B_FALSE); @@ -152,7 +239,9 @@ lacp_port_select(aggr_port_t *portp) static void lacp_port_unselect(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_grp_t *grp = portp->lp_grp; + + ASSERT((grp->lg_mh == NULL) || MAC_PERIM_HELD(grp->lg_mh)); lacp_sel_ports_del(portp); portp->lp_lacp.sm.selected = AGGR_UNSELECTED; @@ -180,9 +269,8 @@ aggr_lacp_init_port(aggr_port_t *portp) aggr_grp_t *aggrp = portp->lp_grp; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp)); - ASSERT(RW_LOCK_HELD(&aggrp->lg_lock)); - ASSERT(RW_LOCK_HELD(&portp->lp_lock)); + ASSERT(aggrp->lg_mh == NULL || MAC_PERIM_HELD(aggrp->lg_mh)); + ASSERT(MAC_PERIM_HELD(portp->lp_mh)); /* actor port # */ pl->ActorPortNumber = portp->lp_portid; @@ -251,6 +339,25 @@ aggr_lacp_init_port(aggr_port_t *portp) pl->wait_while_timer.id = 0; pl->wait_while_timer.val = AGGREGATE_WAIT_TIME; + + pl->lacp_timer_bits = 0; + + mutex_init(&pl->lacp_timer_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&pl->lacp_timer_cv, NULL, CV_DRIVER, NULL); + + pl->lacp_timer_thread = thread_create(NULL, 0, aggr_port_timer_thread, + portp, 0, &p0, TS_RUN, minclsyspri); + + /* + * Hold a reference of the grp and the port and this reference will + * be release when the thread exits. + * + * The reference on the port is used for aggr_port_delete() to + * continue without waiting for the thread to exit; the reference + * on the grp is used for aggr_grp_delete() to wait for the thread + * to exit before calling mac_unregister(). + */ + aggr_grp_port_hold(portp); } /* @@ -264,7 +371,7 @@ lacp_reset_port(aggr_port_t *portp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); pl->NTT = B_FALSE; /* need to transmit */ @@ -306,8 +413,8 @@ lacp_reset_port(aggr_port_t *portp) static void aggr_lacp_mcast_on(aggr_port_t *port) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(port->lp_grp)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return; @@ -319,8 +426,8 @@ aggr_lacp_mcast_on(aggr_port_t *port) static void aggr_lacp_mcast_off(aggr_port_t *port) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(port->lp_grp)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return; @@ -332,26 +439,35 @@ aggr_lacp_mcast_off(aggr_port_t *port) static void start_periodic_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.periodic_timer.id == 0) { - portp->lp_lacp.periodic_timer.id = - timeout(periodic_timer_pop, portp, + mutex_enter(&pl->lacp_timer_lock); + if (pl->periodic_timer.id == 0) { + pl->periodic_timer.id = timeout(periodic_timer_pop, portp, drv_usectohz(1000000 * portp->lp_lacp.periodic_timer.val)); } + mutex_exit(&pl->lacp_timer_lock); } static void stop_periodic_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + timeout_id_t id; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if 
(portp->lp_lacp.periodic_timer.id != 0) { - AGGR_LACP_UNLOCK(portp->lp_grp); - (void) untimeout(portp->lp_lacp.periodic_timer.id); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - portp->lp_lacp.periodic_timer.id = 0; + mutex_enter(&pl->lacp_timer_lock); + if ((id = pl->periodic_timer.id) != 0) { + pl->lacp_timer_bits &= ~LACP_PERIODIC_TIMEOUT; + pl->periodic_timer.id = 0; } + mutex_exit(&pl->lacp_timer_lock); + + if (id != 0) + (void) untimeout(id); } /* @@ -360,13 +476,29 @@ stop_periodic_timer(aggr_port_t *portp) * LACPDU. We then set the periodic state and let * the periodic state machine restart the timer. */ +static void +periodic_timer_pop(void *data) +{ + aggr_port_t *portp = data; + aggr_lacp_port_t *pl = &portp->lp_lacp; + + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_PERIODIC_TIMEOUT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); +} +/* + * When the timer pops, we arrive here to + * clear out LACPDU count as well as transmit an + * LACPDU. We then set the periodic state and let + * the periodic state machine restart the timer. + */ static void -periodic_timer_pop_locked(aggr_port_t *portp) +periodic_timer_pop_handler(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - portp->lp_lacp.periodic_timer.id = NULL; portp->lp_lacp_stats.LACPDUsTx = 0; /* current timestamp */ @@ -390,19 +522,6 @@ periodic_timer_pop_locked(aggr_port_t *portp) lacp_periodic_sm(portp); } -static void -periodic_timer_pop(void *data) -{ - aggr_port_t *portp = data; - - if (portp->lp_closing) - return; - - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - periodic_timer_pop_locked(portp); - AGGR_LACP_UNLOCK(portp->lp_grp); -} - /* * Invoked from: * - startup upon aggregation @@ -417,7 +536,7 @@ lacp_periodic_sm(aggr_port_t *portp) lacp_periodic_state_t oldstate = portp->lp_lacp.sm.periodic_state; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) { @@ -465,7 +584,7 @@ lacp_periodic_sm(aggr_port_t *portp) * a LACPDU. */ stop_periodic_timer(portp); - periodic_timer_pop_locked(portp); + periodic_timer_pop_handler(portp); } /* Rearm timer with value provided by partner */ @@ -483,9 +602,8 @@ lacp_xmit_sm(aggr_port_t *portp) size_t len; mblk_t *mp; hrtime_t now, elapsed; - const mac_txinfo_t *mtp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on || !pl->NTT || !portp->lp_started) @@ -534,12 +652,7 @@ lacp_xmit_sm(aggr_port_t *portp) fill_lacp_pdu(portp, (lacp_t *)(mp->b_rptr + sizeof (struct ether_header))); - /* - * Store the transmit info pointer locally in case it changes between - * loading mt_fn and mt_arg. 
- */ - mtp = portp->lp_txinfo; - mtp->mt_fn(mtp->mt_arg, mp); + (void) mac_tx(portp->lp_mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL); pl->NTT = B_FALSE; portp->lp_lacp_stats.LACPDUsTx++; @@ -563,15 +676,14 @@ fill_lacp_pdu(aggr_port_t *portp, lacp_t *lacp) { aggr_lacp_port_t *pl = &portp->lp_lacp; aggr_grp_t *aggrp = portp->lp_grp; + mac_perim_handle_t pmph; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); + mac_perim_enter_by_mh(portp->lp_mh, &pmph); lacp->subtype = LACP_SUBTYPE; lacp->version = LACP_VERSION; - rw_enter(&aggrp->lg_lock, RW_READER); - rw_enter(&portp->lp_lock, RW_READER); - /* * Actor Information */ @@ -609,8 +721,7 @@ fill_lacp_pdu(aggr_port_t *portp, lacp_t *lacp) lacp->tlv_terminator = TERMINATOR_TLV; lacp->terminator_len = 0x0; - rw_exit(&portp->lp_lock); - rw_exit(&aggrp->lg_lock); + mac_perim_exit(pmph); } /* @@ -633,7 +744,7 @@ lacp_mux_sm(aggr_port_t *portp) aggr_lacp_port_t *pl = &portp->lp_lacp; lacp_mux_state_t oldstate = pl->sm.mux_state; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) { @@ -788,29 +899,28 @@ again: } /* lacp_mux_sm */ -static void +static int receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) { marker_pdu_t *markerp = (marker_pdu_t *)mp->b_rptr; - const mac_txinfo_t *mtp; - AGGR_LACP_LOCK_WRITER(portp->lp_grp); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); AGGR_LACP_DBG(("trunk link: (%d): MARKER PDU received:\n", portp->lp_linkid)); /* LACP_OFF state not in specification so check here. */ if (!portp->lp_lacp.sm.lacp_on) - goto bail; + return (-1); if (MBLKL(mp) < sizeof (marker_pdu_t)) - goto bail; + return (-1); if (markerp->version != MARKER_VERSION) { AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: " "version = %d does not match s/w version %d\n", portp->lp_linkid, markerp->version, MARKER_VERSION)); - goto bail; + return (-1); } if (markerp->tlv_marker == MARKER_RESPONSE_TLV) { @@ -818,21 +928,21 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) AGGR_LACP_DBG(("trunk link (%d): MARKER RESPONSE PDU: " " MARKER TLV = %d - We don't send out info type!\n", portp->lp_linkid, markerp->tlv_marker)); - goto bail; + return (-1); } if (markerp->tlv_marker != MARKER_INFO_TLV) { AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: " " MARKER TLV = %d \n", portp->lp_linkid, markerp->tlv_marker)); - goto bail; + return (-1); } if (markerp->marker_len != MARKER_INFO_RESPONSE_LENGTH) { AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: " " MARKER length = %d \n", portp->lp_linkid, markerp->marker_len)); - goto bail; + return (-1); } if (markerp->requestor_port != portp->lp_lacp.PartnerOperPortNum) { @@ -840,7 +950,7 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) " MARKER Port %d not equal to Partner port %d\n", portp->lp_linkid, markerp->requestor_port, portp->lp_lacp.PartnerOperPortNum)); - goto bail; + return (-1); } if (ether_cmp(&markerp->system_id, @@ -848,7 +958,7 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) AGGR_LACP_DBG(("trunk link (%d): MARKER PDU: " " MARKER MAC not equal to Partner MAC\n", portp->lp_linkid)); - goto bail; + return (-1); } /* @@ -861,23 +971,9 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) ASSERT(MBLKHEAD(mp) >= sizeof (struct ether_header)); mp->b_rptr -= sizeof (struct ether_header); fill_lacp_ether(portp, (struct ether_header *)mp->b_rptr); - - /* - * Store the transmit info pointer locally in case it changes between - * loading 
mt_fn and mt_arg. - */ - mtp = portp->lp_txinfo; - AGGR_LACP_UNLOCK(portp->lp_grp); - - mtp->mt_fn(mtp->mt_arg, mp); - return; - -bail: - AGGR_LACP_UNLOCK(portp->lp_grp); - freemsg(mp); + return (0); } - /* * Update the LACP mode (off, active, or passive) of the specified group. */ @@ -887,8 +983,8 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode) aggr_lacp_mode_t old_mode = grp->lg_lacp_mode; aggr_port_t *port; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(!grp->lg_closing); if (mode == old_mode) return; @@ -904,20 +1000,12 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode) /* OFF -> {PASSIVE,ACTIVE} */ /* turn OFF Collector_Distributor */ aggr_set_coll_dist(port, B_FALSE); - rw_enter(&port->lp_lock, RW_WRITER); lacp_on(port); - if (port->lp_state == AGGR_PORT_STATE_ATTACHED) - aggr_lacp_port_attached(port); - rw_exit(&port->lp_lock); } else if (mode == AGGR_LACP_OFF) { /* {PASSIVE,ACTIVE} -> OFF */ - rw_enter(&port->lp_lock, RW_WRITER); lacp_off(port); - rw_exit(&port->lp_lock); - if (!grp->lg_closing) { - /* Turn ON Collector_Distributor */ - aggr_set_coll_dist(port, B_TRUE); - } + /* Turn ON Collector_Distributor */ + aggr_set_coll_dist(port, B_TRUE); } else { /* PASSIVE->ACTIVE or ACTIVE->PASSIVE */ port->lp_lacp.sm.begin = B_TRUE; @@ -928,9 +1016,6 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode) lacp_receive_sm(port, NULL); lacp_mux_sm(port); } - - if (grp->lg_closing) - break; } } @@ -943,8 +1028,7 @@ aggr_lacp_update_timer(aggr_grp_t *grp, aggr_lacp_timer_t timer) { aggr_port_t *port; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); if (timer == grp->aggr.PeriodicTimer) return; @@ -958,6 +1042,32 @@ aggr_lacp_update_timer(aggr_grp_t *grp, aggr_lacp_timer_t timer) } } +void +aggr_port_lacp_set_mode(aggr_grp_t *grp, aggr_port_t *port) +{ + aggr_lacp_mode_t mode; + aggr_lacp_timer_t timer; + + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + + mode = grp->lg_lacp_mode; + timer = grp->aggr.PeriodicTimer; + + port->lp_lacp.ActorAdminPortState.bit.activity = + port->lp_lacp.ActorOperPortState.bit.activity = + (mode == AGGR_LACP_ACTIVE); + + port->lp_lacp.ActorAdminPortState.bit.timeout = + port->lp_lacp.ActorOperPortState.bit.timeout = + (timer == AGGR_LACP_TIMER_SHORT); + + if (mode == AGGR_LACP_OFF) { + /* Turn ON Collector_Distributor */ + aggr_set_coll_dist(port, B_TRUE); + } else { /* LACP_ACTIVE/PASSIVE */ + lacp_on(port); + } +} /* * Sets the initial LACP mode (off, active, passive) and LACP timer @@ -969,30 +1079,13 @@ aggr_lacp_set_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode, { aggr_port_t *port; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); grp->lg_lacp_mode = mode; grp->aggr.PeriodicTimer = timer; - for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - port->lp_lacp.ActorAdminPortState.bit.activity = - port->lp_lacp.ActorOperPortState.bit.activity = - (mode == AGGR_LACP_ACTIVE); - - port->lp_lacp.ActorAdminPortState.bit.timeout = - port->lp_lacp.ActorOperPortState.bit.timeout = - (timer == AGGR_LACP_TIMER_SHORT); - - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { - /* Turn ON Collector_Distributor */ - aggr_set_coll_dist(port, B_TRUE); - } else { /* LACP_ACTIVE/PASSIVE */ - rw_enter(&port->lp_lock, RW_WRITER); - lacp_on(port); - rw_exit(&port->lp_lock); - } - } + for (port = grp->lg_ports; port != NULL; port 
= port->lp_next) + aggr_port_lacp_set_mode(grp, port); } /* @@ -1148,7 +1241,7 @@ lacp_selection_logic(aggr_port_t *portp) boolean_t reset_mac = B_FALSE; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) { @@ -1377,47 +1470,65 @@ static void wait_while_timer_pop(void *data) { aggr_port_t *portp = data; + aggr_lacp_port_t *pl = &portp->lp_lacp; - if (portp->lp_closing) - return; + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_WAIT_WHILE_TIMEOUT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); +} - AGGR_LACP_LOCK_WRITER(portp->lp_grp); +/* + * wait_while_timer_pop_handler - When the timer pops, we arrive here to + * set ready_n and trigger the selection logic. + */ +static void +wait_while_timer_pop_handler(aggr_port_t *portp) +{ + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); AGGR_LACP_DBG(("trunk link:(%d): wait_while_timer pop \n", portp->lp_linkid)); - portp->lp_lacp.wait_while_timer.id = 0; portp->lp_lacp.sm.ready_n = B_TRUE; lacp_selection_logic(portp); - AGGR_LACP_UNLOCK(portp->lp_grp); } static void start_wait_while_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.wait_while_timer.id == 0) { - portp->lp_lacp.wait_while_timer.id = + mutex_enter(&pl->lacp_timer_lock); + if (pl->wait_while_timer.id == 0) { + pl->wait_while_timer.id = timeout(wait_while_timer_pop, portp, drv_usectohz(1000000 * portp->lp_lacp.wait_while_timer.val)); } + mutex_exit(&pl->lacp_timer_lock); } static void -stop_wait_while_timer(portp) -aggr_port_t *portp; +stop_wait_while_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + timeout_id_t id; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.wait_while_timer.id != 0) { - AGGR_LACP_UNLOCK(portp->lp_grp); - (void) untimeout(portp->lp_lacp.wait_while_timer.id); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - portp->lp_lacp.wait_while_timer.id = 0; + mutex_enter(&pl->lacp_timer_lock); + if ((id = pl->wait_while_timer.id) != 0) { + pl->lacp_timer_bits &= ~LACP_WAIT_WHILE_TIMEOUT; + pl->wait_while_timer.id = 0; } + mutex_exit(&pl->lacp_timer_lock); + + if (id != 0) + (void) untimeout(id); } /* @@ -1432,52 +1543,30 @@ aggr_lacp_port_attached(aggr_port_t *portp) aggr_grp_t *grp = portp->lp_grp; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(portp->lp_mh)); ASSERT(portp->lp_state == AGGR_PORT_STATE_ATTACHED); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); AGGR_LACP_DBG(("aggr_lacp_port_attached: port %d\n", portp->lp_linkid)); portp->lp_lacp.sm.port_enabled = B_TRUE; /* link on */ - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { - pl->ActorAdminPortState.bit.activity = - pl->ActorOperPortState.bit.activity = B_FALSE; - - /* Turn ON Collector_Distributor */ - aggr_set_coll_dist_locked(portp, B_TRUE); - + if (grp->lg_lacp_mode == AGGR_LACP_OFF) return; - } - - pl->ActorAdminPortState.bit.activity = - pl->ActorOperPortState.bit.activity = - (grp->lg_lacp_mode == AGGR_LACP_ACTIVE); - - pl->ActorAdminPortState.bit.timeout = - pl->ActorOperPortState.bit.timeout = - (grp->aggr.PeriodicTimer == AGGR_LACP_TIMER_SHORT); pl->sm.lacp_enabled = B_TRUE; 
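
	/*
	 * (Editorial note) The bring-up order below matters: sm.begin
	 * restarts every state machine from its initial state, the
	 * receive and mux machines are run once, the Slow Protocols
	 * multicast address is joined, and lacp_selection_logic() then
	 * kicks the periodic machine via the receive machine.
	 */
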
pl->ActorOperPortState.bit.aggregation = B_TRUE; pl->sm.begin = B_TRUE; - if (!pl->sm.lacp_on) { - /* Turn OFF Collector_Distributor */ - aggr_set_coll_dist_locked(portp, B_FALSE); - - lacp_on(portp); - } else { - lacp_receive_sm(portp, NULL); - lacp_mux_sm(portp); + lacp_receive_sm(portp, NULL); + lacp_mux_sm(portp); - /* Enable Multicast Slow Protocol address */ - aggr_lacp_mcast_on(portp); + /* Enable Multicast Slow Protocol address */ + aggr_lacp_mcast_on(portp); - /* periodic_sm is started up from the receive machine */ - lacp_selection_logic(portp); - } + /* periodic_sm is started up from the receive machine */ + lacp_selection_logic(portp); } /* @@ -1489,8 +1578,8 @@ aggr_lacp_port_detached(aggr_port_t *portp) { aggr_grp_t *grp = portp->lp_grp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(portp->lp_mh)); AGGR_LACP_DBG(("aggr_lacp_port_detached: port %d\n", portp->lp_linkid)); @@ -1500,34 +1589,35 @@ aggr_lacp_port_detached(aggr_port_t *portp) if (grp->lg_lacp_mode == AGGR_LACP_OFF) return; - /* Disable Slow Protocol PDUs */ - lacp_off(portp); -} - + portp->lp_lacp.sm.lacp_enabled = B_FALSE; + lacp_selection_logic(portp); + lacp_mux_sm(portp); + lacp_periodic_sm(portp); -/* - * Invoked after the outbound port selection policy has been changed. - */ -void -aggr_lacp_policy_changed(aggr_grp_t *grp) -{ - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + /* + * Disable Slow Protocol Timers. + */ + stop_periodic_timer(portp); + stop_current_while_timer(portp); + stop_wait_while_timer(portp); - /* suspend transmission for CollectorMaxDelay time */ - delay(grp->aggr.CollectorMaxDelay * 10); + /* Disable Multicast Slow Protocol address */ + aggr_lacp_mcast_off(portp); + aggr_set_coll_dist(portp, B_FALSE); } - /* * Enable Slow Protocol LACP and Marker PDUs. */ static void lacp_on(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - ASSERT(RW_WRITE_HELD(&portp->lp_grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + mac_perim_handle_t mph; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); + + mac_perim_enter_by_mh(portp->lp_mh, &mph); /* * Reset the state machines and Partner operational @@ -1535,67 +1625,69 @@ lacp_on(aggr_port_t *portp) * our link state. 
*/ lacp_reset_port(portp); - portp->lp_lacp.sm.lacp_on = B_TRUE; + pl->sm.lacp_on = B_TRUE; AGGR_LACP_DBG(("lacp_on:(%d): \n", portp->lp_linkid)); + if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) { + pl->sm.port_enabled = B_TRUE; + pl->sm.lacp_enabled = B_TRUE; + pl->ActorOperPortState.bit.aggregation = B_TRUE; + } + lacp_receive_sm(portp, NULL); lacp_mux_sm(portp); - if (portp->lp_state != AGGR_PORT_STATE_ATTACHED) - return; - - /* Enable Multicast Slow Protocol address */ - aggr_lacp_mcast_on(portp); + if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) { + /* Enable Multicast Slow Protocol address */ + aggr_lacp_mcast_on(portp); - /* periodic_sm is started up from the receive machine */ - lacp_selection_logic(portp); + /* periodic_sm is started up from the receive machine */ + lacp_selection_logic(portp); + } +done: + mac_perim_exit(mph); } /* lacp_on */ - /* Disable Slow Protocol LACP and Marker PDUs */ static void lacp_off(aggr_port_t *portp) { - aggr_grp_t *grp = portp->lp_grp; + aggr_lacp_port_t *pl = &portp->lp_lacp; + mac_perim_handle_t mph; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); + mac_perim_enter_by_mh(portp->lp_mh, &mph); - portp->lp_lacp.sm.lacp_on = B_FALSE; + pl->sm.lacp_on = B_FALSE; AGGR_LACP_DBG(("lacp_off:(%d): \n", portp->lp_linkid)); - /* - * Disable Slow Protocol Timers. We must temporarily release - * the group and port locks to avoid deadlocks. Make sure that - * neither the port nor group are closing after re-acquiring - * their locks. - */ - rw_exit(&portp->lp_lock); - rw_exit(&grp->lg_lock); - - stop_periodic_timer(portp); - stop_current_while_timer(portp); - stop_wait_while_timer(portp); + if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) { + /* + * Disable Slow Protocol Timers. 
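
(A note on why this can now be done synchronously: the stop_*_timer() routines drop lacp_timer_lock before calling untimeout(), and the heavyweight pop handlers run in aggr_port_timer_thread(), which re-checks its pending bit under that lock, so the untimeout()-versus-perimeter deadlock described at the top of this file cannot arise on this path.)
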
+ */ + stop_periodic_timer(portp); + stop_current_while_timer(portp); + stop_wait_while_timer(portp); - rw_enter(&grp->lg_lock, RW_WRITER); - rw_enter(&portp->lp_lock, RW_WRITER); + /* Disable Multicast Slow Protocol address */ + aggr_lacp_mcast_off(portp); - if (!portp->lp_closing && !grp->lg_closing) { - lacp_mux_sm(portp); - lacp_periodic_sm(portp); - lacp_selection_logic(portp); + pl->sm.port_enabled = B_FALSE; + pl->sm.lacp_enabled = B_FALSE; + pl->ActorOperPortState.bit.aggregation = B_FALSE; } - /* Turn OFF Collector_Distributor */ - aggr_set_coll_dist_locked(portp, B_FALSE); + lacp_mux_sm(portp); + lacp_periodic_sm(portp); + lacp_selection_logic(portp); - /* Disable Multicast Slow Protocol address */ - aggr_lacp_mcast_off(portp); + /* Turn OFF Collector_Distributor */ + aggr_set_coll_dist(portp, B_FALSE); lacp_reset_port(portp); + mac_perim_exit(mph); } @@ -1627,61 +1719,71 @@ valid_lacp_pdu(aggr_port_t *portp, lacp_t *lacp) static void start_current_while_timer(aggr_port_t *portp, uint_t time) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - - if (portp->lp_lacp.current_while_timer.id == 0) { - if (time > 0) { - portp->lp_lacp.current_while_timer.val = time; - } else if (portp->lp_lacp.ActorOperPortState.bit.timeout) { - portp->lp_lacp.current_while_timer.val = - SHORT_TIMEOUT_TIME; - } else { - portp->lp_lacp.current_while_timer.val = - LONG_TIMEOUT_TIME; - } + aggr_lacp_port_t *pl = &portp->lp_lacp; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); + + mutex_enter(&pl->lacp_timer_lock); + if (pl->current_while_timer.id == 0) { + if (time > 0) + pl->current_while_timer.val = time; + else if (pl->ActorOperPortState.bit.timeout) + pl->current_while_timer.val = SHORT_TIMEOUT_TIME; + else + pl->current_while_timer.val = LONG_TIMEOUT_TIME; - portp->lp_lacp.current_while_timer.id = + pl->current_while_timer.id = timeout(current_while_timer_pop, portp, drv_usectohz((clock_t)1000000 * (clock_t)portp->lp_lacp.current_while_timer.val)); } + mutex_exit(&pl->lacp_timer_lock); } static void stop_current_while_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + timeout_id_t id; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.current_while_timer.id != 0) { - AGGR_LACP_UNLOCK(portp->lp_grp); - (void) untimeout(portp->lp_lacp.current_while_timer.id); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - portp->lp_lacp.current_while_timer.id = 0; + mutex_enter(&pl->lacp_timer_lock); + if ((id = pl->current_while_timer.id) != 0) { + pl->lacp_timer_bits &= ~LACP_CURRENT_WHILE_TIMEOUT; + pl->current_while_timer.id = 0; } -} + mutex_exit(&pl->lacp_timer_lock); + if (id != 0) + (void) untimeout(id); +} static void current_while_timer_pop(void *data) { aggr_port_t *portp = (aggr_port_t *)data; + aggr_lacp_port_t *pl = &portp->lp_lacp; - if (portp->lp_closing) - return; + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_CURRENT_WHILE_TIMEOUT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); +} - AGGR_LACP_LOCK_WRITER(portp->lp_grp); +static void +current_while_timer_pop_handler(aggr_port_t *portp) +{ + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); AGGR_LACP_DBG(("trunk link:(%d): current_while_timer " "pop id=%p\n", portp->lp_linkid, portp->lp_lacp.current_while_timer.id)); - portp->lp_lacp.current_while_timer.id = 0; lacp_receive_sm(portp, NULL); - AGGR_LACP_UNLOCK(portp->lp_grp); } - /* * record_Default - Simply copies over administrative values * to the partner operational 
values, and sets our state to indicate we @@ -1692,7 +1794,7 @@ record_Default(aggr_port_t *portp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); pl->PartnerOperPortNum = pl->PartnerAdminPortNum; pl->PartnerOperPortPriority = pl->PartnerAdminPortPriority; @@ -1713,7 +1815,7 @@ record_PDU(aggr_port_t *portp, lacp_t *lacp) aggr_lacp_port_t *pl = &portp->lp_lacp; uint8_t save_sync; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); /* * Partner Information @@ -1780,7 +1882,7 @@ update_selected(aggr_port_t *portp, lacp_t *lacp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); if ((pl->PartnerOperPortNum != ntohs(lacp->actor_info.port)) || (pl->PartnerOperPortPriority != @@ -1814,7 +1916,7 @@ update_default_selected(aggr_port_t *portp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); if ((pl->PartnerAdminPortNum != pl->PartnerOperPortNum) || (pl->PartnerOperPortPriority != pl->PartnerAdminPortPriority) || @@ -1844,7 +1946,7 @@ update_NTT(aggr_port_t *portp, lacp_t *lacp) aggr_grp_t *aggrp = portp->lp_grp; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); if ((pl->ActorPortNumber != ntohs(lacp->partner_info.port)) || (pl->ActorPortPriority != @@ -1890,7 +1992,7 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp) aggr_lacp_port_t *pl = &portp->lp_lacp; lacp_receive_state_t oldstate = pl->sm.receive_state; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) @@ -1918,7 +2020,6 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp) pl->sm.receive_state = LACP_DEFAULTED; } - if (!((lacp && (oldstate == LACP_CURRENT) && (pl->sm.receive_state == LACP_CURRENT)))) { AGGR_LACP_DBG(("lacp_receive_sm(%d):%s--->%s\n", @@ -2068,28 +2169,19 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp) static void aggr_set_coll_dist(aggr_port_t *portp, boolean_t enable) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - rw_enter(&portp->lp_lock, RW_WRITER); - aggr_set_coll_dist_locked(portp, enable); - rw_exit(&portp->lp_lock); -} - -static void -aggr_set_coll_dist_locked(aggr_port_t *portp, boolean_t enable) -{ - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + mac_perim_handle_t mph; AGGR_LACP_DBG(("AGGR_SET_COLL_DIST_TYPE: (%d) %s\n", portp->lp_linkid, enable ? "ENABLED" : "DISABLED")); + mac_perim_enter_by_mh(portp->lp_mh, &mph); if (!enable) { /* * Turn OFF Collector_Distributor. */ portp->lp_collector_enabled = B_FALSE; aggr_send_port_disable(portp); - return; + goto done; } /* @@ -2102,14 +2194,21 @@ aggr_set_coll_dist_locked(aggr_port_t *portp, boolean_t enable) portp->lp_collector_enabled = B_TRUE; aggr_send_port_enable(portp); } + +done: + mac_perim_exit(mph); } /* - * Process a received Marker or LACPDU. + * Because the LACP packet processing needs to enter the aggr's mac perimeter + * and that would potentially cause a deadlock with the thread in which the + * grp/port is deleted, we defer the packet process to a worker thread. Here + * we only enqueue the received Marker or LACPDU for later processing. 
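
A userland analogue of this hand-off, in plain POSIX threads (hypothetical and self-contained; it models only the queue/flag/condvar shape, while the real code additionally parks a refheld port pointer in b_prev and processes each message under the aggr's perimeter):

	#include <pthread.h>
	#include <stdlib.h>

	struct msg { struct msg *next; /* payload elided */ };

	static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t q_cv = PTHREAD_COND_INITIALIZER;
	static struct msg *q_head, *q_tail;
	static int q_done;

	static void
	enqueue(struct msg *m)
	{
		pthread_mutex_lock(&q_lock);
		if (q_done) {			/* being torn down: drop */
			pthread_mutex_unlock(&q_lock);
			free(m);
			return;
		}
		m->next = NULL;
		if (q_tail == NULL)
			q_head = m;
		else
			q_tail->next = m;
		q_tail = m;
		pthread_cond_broadcast(&q_cv);
		pthread_mutex_unlock(&q_lock);
	}

	static void *
	worker(void *arg)
	{
		struct msg *m, *next;

		(void) arg;
		pthread_mutex_lock(&q_lock);
		while (!q_done) {
			if ((m = q_head) == NULL) {
				pthread_cond_wait(&q_cv, &q_lock);
				continue;
			}
			q_head = q_tail = NULL;
			pthread_mutex_unlock(&q_lock);
			for (; m != NULL; m = next) {	/* lock dropped */
				next = m->next;
				free(m);		/* "process" */
			}
			pthread_mutex_lock(&q_lock);
		}
		for (m = q_head; m != NULL; m = next) {	/* drain on exit */
			next = m->next;
			free(m);
		}
		q_head = q_tail = NULL;
		pthread_mutex_unlock(&q_lock);
		return (NULL);
	}

Processing with the lock dropped is what lets the producer (here, the receive path) keep enqueueing without blocking behind a slow consumer.
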
*/ void -aggr_lacp_rx(aggr_port_t *portp, mblk_t *dmp) +aggr_lacp_rx_enqueue(aggr_port_t *portp, mblk_t *dmp) { + aggr_grp_t *grp = portp->lp_grp; lacp_t *lacp; dmp->b_rptr += sizeof (struct ether_header); @@ -2120,34 +2219,143 @@ aggr_lacp_rx(aggr_port_t *portp, mblk_t *dmp) } lacp = (lacp_t *)dmp->b_rptr; + if (lacp->subtype != LACP_SUBTYPE && lacp->subtype != MARKER_SUBTYPE) { + AGGR_LACP_DBG(("aggr_lacp_rx_enqueue: (%d): " + "Unknown Slow Protocol type %d\n", + portp->lp_linkid, lacp->subtype)); + freemsg(dmp); + return; + } + + mutex_enter(&grp->lg_lacp_lock); + + /* + * If the lg_lacp_done is set, this aggregation is in the process of + * being deleted, return directly. + */ + if (grp->lg_lacp_done) { + mutex_exit(&grp->lg_lacp_lock); + freemsg(dmp); + return; + } + + if (grp->lg_lacp_tail == NULL) { + grp->lg_lacp_head = grp->lg_lacp_tail = dmp; + } else { + grp->lg_lacp_tail->b_next = dmp; + grp->lg_lacp_tail = dmp; + } + + /* + * Hold a reference of the port so that the port won't be freed when it + * is removed from the aggr. The b_prev field is borrowed to save the + * port information. + */ + AGGR_PORT_REFHOLD(portp); + dmp->b_prev = (mblk_t *)portp; + cv_broadcast(&grp->lg_lacp_cv); + mutex_exit(&grp->lg_lacp_lock); +} +static void +aggr_lacp_rx(mblk_t *dmp) +{ + aggr_port_t *portp = (aggr_port_t *)dmp->b_prev; + mac_perim_handle_t mph; + lacp_t *lacp; + + dmp->b_prev = NULL; + + mac_perim_enter_by_mh(portp->lp_grp->lg_mh, &mph); + if (portp->lp_closing) + goto done; + + lacp = (lacp_t *)dmp->b_rptr; switch (lacp->subtype) { case LACP_SUBTYPE: AGGR_LACP_DBG(("aggr_lacp_rx:(%d): LACPDU received.\n", portp->lp_linkid)); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); if (!portp->lp_lacp.sm.lacp_on) { - AGGR_LACP_UNLOCK(portp->lp_grp); break; } lacp_receive_sm(portp, lacp); - AGGR_LACP_UNLOCK(portp->lp_grp); break; case MARKER_SUBTYPE: AGGR_LACP_DBG(("aggr_lacp_rx:(%d): Marker Packet received.\n", portp->lp_linkid)); - (void) receive_marker_pdu(portp, dmp); - break; + if (receive_marker_pdu(portp, dmp) != 0) + break; - default: - AGGR_LACP_DBG(("aggr_lacp_rx: (%d): " - "Unknown Slow Protocol type %d\n", - portp->lp_linkid, lacp->subtype)); - break; + (void) mac_tx(portp->lp_mch, dmp, 0, MAC_DROP_ON_NO_DESC, NULL); + mac_perim_exit(mph); + AGGR_PORT_REFRELE(portp); + return; } +done: + mac_perim_exit(mph); + AGGR_PORT_REFRELE(portp); freemsg(dmp); } + +void +aggr_lacp_rx_thread(void *arg) +{ + callb_cpr_t cprinfo; + aggr_grp_t *grp = (aggr_grp_t *)arg; + aggr_port_t *port; + mblk_t *mp, *nextmp; + + CALLB_CPR_INIT(&cprinfo, &grp->lg_lacp_lock, callb_generic_cpr, + "aggr_lacp_rx_thread"); + + mutex_enter(&grp->lg_lacp_lock); + + /* + * Quit the thread if the grp is deleted. + */ + while (!grp->lg_lacp_done) { + if ((mp = grp->lg_lacp_head) == NULL) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); + CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_lacp_lock); + continue; + } + + grp->lg_lacp_head = grp->lg_lacp_tail = NULL; + mutex_exit(&grp->lg_lacp_lock); + + while (mp != NULL) { + nextmp = mp->b_next; + mp->b_next = NULL; + aggr_lacp_rx(mp); + mp = nextmp; + } + mutex_enter(&grp->lg_lacp_lock); + } + + /* + * The grp is being destroyed, simply free all of the LACP messages + * left in the queue which did not have the chance to be processed. + * We cannot use freemsgchain() here since we need to clear the + * b_prev field. 
+ */ + while ((mp = grp->lg_lacp_head) != NULL) { + port = (aggr_port_t *)mp->b_prev; + AGGR_PORT_REFRELE(port); + nextmp = mp->b_next; + mp->b_next = NULL; + mp->b_prev = NULL; + freemsg(mp); + mp = nextmp; + } + + grp->lg_lacp_head = grp->lg_lacp_tail = NULL; + grp->lg_lacp_rx_thread = NULL; + cv_broadcast(&grp->lg_lacp_cv); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index cad61f559f..a84c4a5c2a 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -46,6 +46,7 @@ #include <sys/stat.h> #include <sys/sdt.h> #include <sys/dlpi.h> +#include <sys/dls.h> #include <sys/aggr.h> #include <sys/aggr_impl.h> @@ -58,11 +59,7 @@ static void aggr_port_notify_cb(void *, mac_notify_type_t); static int aggr_port_constructor(void *buf, void *arg, int kmflag) { - aggr_port_t *port = buf; - bzero(buf, sizeof (aggr_port_t)); - rw_init(&port->lp_lock, NULL, RW_DRIVER, NULL); - return (0); } @@ -72,7 +69,10 @@ aggr_port_destructor(void *buf, void *arg) { aggr_port_t *port = buf; - rw_destroy(&port->lp_lock); + ASSERT(port->lp_mnh == NULL); + ASSERT(port->lp_mphp == NULL); + ASSERT(!port->lp_grp_added); + ASSERT(port->lp_hwgh == NULL); } void @@ -103,31 +103,37 @@ aggr_port_fini(void) id_space_destroy(aggr_portids); } -mac_resource_handle_t -aggr_port_resource_add(void *arg, mac_resource_t *mrp) -{ - aggr_port_t *port = (aggr_port_t *)arg; - aggr_grp_t *grp = port->lp_grp; - - return (mac_resource_add(grp->lg_mh, mrp)); -} - +/* ARGSUSED */ void aggr_port_init_callbacks(aggr_port_t *port) { /* add the port's receive callback */ - port->lp_mnh = mac_notify_add(port->lp_mh, aggr_port_notify_cb, - (void *)port); - - /* set port's resource_add callback */ - mac_resource_set(port->lp_mh, aggr_port_resource_add, (void *)port); + port->lp_mnh = mac_notify_add(port->lp_mh, aggr_port_notify_cb, port); + /* + * Hold a reference of the grp and the port and this reference will + * be release when the thread exits. + * + * The reference on the port is used for aggr_port_delete() to + * continue without waiting for the thread to exit; the reference + * on the grp is used for aggr_grp_delete() to wait for the thread + * to exit before calling mac_unregister(). + * + * Note that these references will be released either in + * aggr_port_delete() when mac_notify_remove() succeeds, or in + * the aggr_port_notify_cb() callback when the port is deleted + * (lp_closing is set). 
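+ *
+ * [Editor's note: an illustrative summary, not part of the original
+ * patch; it condenses the hold/release pairing described above from
+ * the code in this file.]
+ *
+ *	aggr_grp_port_hold(port);		here, at notify-add time
+ *
+ *	in aggr_port_delete(), when the non-blocking remove succeeds:
+ *		if (mac_notify_remove(port->lp_mnh, B_FALSE) == 0)
+ *			aggr_grp_port_rele(port);
+ *
+ *	otherwise in aggr_port_notify_cb(), once the port is closing:
+ *		if (port->lp_closing)
+ *			aggr_grp_port_rele(port);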
+ */ + aggr_grp_port_hold(port); } +/* ARGSUSED */ int -aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) +aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, + aggr_port_t **pp) { int err; mac_handle_t mh; + mac_client_handle_t mch = NULL; aggr_port_t *port; uint16_t portid; uint_t i; @@ -135,6 +141,11 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) const mac_info_t *mip; uint32_t note; uint32_t margin; + char client_name[MAXNAMELEN]; + char aggr_name[MAXNAMELEN]; + char port_name[MAXNAMELEN]; + mac_diag_t diag; + mac_unicast_handle_t mah; *pp = NULL; @@ -165,6 +176,20 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) } } + if (((err = dls_mgmt_get_linkinfo(grp->lg_linkid, + aggr_name, NULL, NULL, NULL)) != 0) || + ((err = dls_mgmt_get_linkinfo(linkid, port_name, + NULL, NULL, NULL)) != 0)) { + goto fail; + } + + (void) snprintf(client_name, MAXNAMELEN, "%s-%s", aggr_name, port_name); + if ((err = mac_client_open(mh, &mch, client_name, + MAC_OPEN_FLAGS_IS_AGGR_PORT | MAC_OPEN_FLAGS_EXCLUSIVE | + MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK)) != 0) { + goto fail; + } + if ((portid = (uint16_t)id_alloc(aggr_portids)) == 0) { err = ENOMEM; goto fail; @@ -180,10 +205,9 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) goto fail; } - if (!mac_active_set(mh)) { + if ((err = mac_unicast_primary_add(mch, &mah, &diag)) != 0) { VERIFY(mac_margin_remove(mh, margin) == 0); id_free(aggr_portids, portid); - err = EBUSY; goto fail; } @@ -192,15 +216,14 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) port->lp_refs = 1; port->lp_next = NULL; port->lp_mh = mh; + port->lp_mch = mch; port->lp_mip = mip; port->lp_linkid = linkid; - port->lp_closing = 0; + port->lp_closing = B_FALSE; + port->lp_mah = mah; /* get the port's original MAC address */ - mac_unicst_get(port->lp_mh, port->lp_addr); - - /* set port's transmit information */ - port->lp_txinfo = mac_tx_get(port->lp_mh); + mac_unicast_primary_get(port->lp_mh, port->lp_addr); /* initialize state */ port->lp_state = AGGR_PORT_STATE_STANDBY; @@ -213,6 +236,7 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) port->lp_no_link_update = no_link_update; port->lp_portid = portid; port->lp_margin = margin; + port->lp_prom_addr = NULL; /* * Save the current statistics of the port. They will be used @@ -235,6 +259,8 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) return (0); fail: + if (mch != NULL) + mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(mh); return (err); } @@ -242,19 +268,48 @@ fail: void aggr_port_delete(aggr_port_t *port) { + aggr_lacp_port_t *pl = &port->lp_lacp; + + ASSERT(port->lp_mphp == NULL); + ASSERT(!port->lp_promisc_on); + + port->lp_closing = B_TRUE; + VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0); - mac_rx_remove_wait(port->lp_mh); - mac_resource_set(port->lp_mh, NULL, NULL); - mac_notify_remove(port->lp_mh, port->lp_mnh); - mac_active_clear(port->lp_mh); + mac_rx_clear(port->lp_mch); + /* + * If the notification callback is already in process and waiting for + * the aggr grp's mac perimeter, don't wait (otherwise there would be + * deadlock). Otherwise, if mac_notify_remove() succeeds, we can + * release the reference held when mac_notify_add() is called. 
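+ *
+ * [Editor's note: an illustrative diagram, not part of the original
+ * patch; it assumes the second argument of mac_notify_remove()
+ * selects whether to wait for a running callback, as the text above
+ * describes.] The cycle that the non-blocking remove avoids:
+ *
+ *	thread A: aggr_port_delete()	holds the aggr's mac perimeter
+ *					and would wait for the callback
+ *	thread B: aggr_port_notify_cb()	waits to enter the aggr's mac
+ *					perimeter
+ *
+ * Waiting (B_TRUE) would leave A blocked on B and B blocked on A;
+ * passing B_FALSE breaks the cycle and defers the release to B.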
+ */ + if ((port->lp_mnh != NULL) && + (mac_notify_remove(port->lp_mnh, B_FALSE) == 0)) { + aggr_grp_port_rele(port); + } + port->lp_mnh = NULL; + + /* + * Inform the the port lacp timer thread to exit. Note that waiting + * for the thread to exit may cause deadlock since that thread may + * need to enter into the mac perimeter which we are currently in. + * It is fine to continue without waiting though since that thread + * is holding a reference of the port. + */ + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_THREAD_EXIT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); /* * Restore the port MAC address. Note it is called after the * port's notification callback being removed. This prevent * port's MAC_NOTE_UNICST notify callback function being called. */ - (void) mac_unicst_set(port->lp_mh, port->lp_addr); + (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr); + (void) mac_unicast_remove(port->lp_mch, port->lp_mah); + mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(port->lp_mh); AGGR_PORT_REFRELE(port); } @@ -268,6 +323,8 @@ aggr_port_free(aggr_port_t *port) port->lp_grp = NULL; id_free(aggr_portids, port->lp_portid); port->lp_portid = 0; + mutex_destroy(&port->lp_lacp.lacp_timer_lock); + cv_destroy(&port->lp_lacp.lacp_timer_cv); kmem_cache_free(aggr_port_cache, port); } @@ -276,7 +333,7 @@ aggr_port_free(aggr_port_t *port) * one of the constituent ports. */ boolean_t -aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock) +aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port) { boolean_t do_attach = B_FALSE; boolean_t do_detach = B_FALSE; @@ -284,16 +341,10 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock) uint64_t ifspeed; link_state_t link_state; link_duplex_t link_duplex; + mac_perim_handle_t mph; - if (dolock) { - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - } else { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - } - - rw_enter(&port->lp_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* * link state change? For links that do not support link state @@ -334,15 +385,10 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock) link_state_changed = aggr_grp_attach_port(grp, port); } else if (do_detach) { /* detach the port from the aggregation */ - link_state_changed = aggr_grp_detach_port(grp, port, B_TRUE); + link_state_changed = aggr_grp_detach_port(grp, port); } - rw_exit(&port->lp_lock); - - if (dolock) { - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - } + mac_perim_exit(mph); return (link_state_changed); } @@ -357,21 +403,20 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port, boolean_t mac_addr_changed = B_FALSE; boolean_t link_state_changed = B_FALSE; uint8_t mac_addr[ETHERADDRL]; + mac_perim_handle_t mph; + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); ASSERT(mac_addr_changedp != NULL); ASSERT(link_state_changedp != NULL); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* * If it is called when setting the MAC address to the * aggregation group MAC address, do nothing. 
*/ - mac_unicst_get(port->lp_mh, mac_addr); + mac_unicast_primary_get(port->lp_mh, mac_addr); if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { - rw_exit(&port->lp_lock); + mac_perim_exit(mph); goto done; } @@ -381,10 +426,7 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port, aggr_grp_port_mac_changed(grp, port, &mac_addr_changed, &link_state_changed); - rw_exit(&port->lp_lock); - - if (grp->lg_closing) - goto done; + mac_perim_exit(mph); /* * If this port was used to determine the MAC address of @@ -397,8 +439,6 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port, done: *mac_addr_changedp = mac_addr_changed; *link_state_changedp = link_state_changed; - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); } /* @@ -411,22 +451,26 @@ aggr_port_notify_cb(void *arg, mac_notify_type_t type) aggr_port_t *port = arg; aggr_grp_t *grp = port->lp_grp; boolean_t mac_addr_changed, link_state_changed; + mac_perim_handle_t mph; - /* - * Do nothing if the aggregation or the port is in the deletion - * process. Note that this is necessary to avoid deadlock. - */ - if ((grp->lg_closing) || (port->lp_closing)) - return; + mac_perim_enter_by_mh(grp->lg_mh, &mph); + if (port->lp_closing) { + mac_perim_exit(mph); - AGGR_PORT_REFHOLD(port); + /* + * Release the reference so it is safe for aggr to call + * mac_unregister() now. + */ + aggr_grp_port_rele(port); + return; + } switch (type) { case MAC_NOTE_TX: mac_tx_update(grp->lg_mh); break; case MAC_NOTE_LINK: - if (aggr_port_notify_link(grp, port, B_TRUE)) + if (aggr_port_notify_link(grp, port)) mac_link_update(grp->lg_mh, grp->lg_link_state); break; case MAC_NOTE_UNICST: @@ -437,46 +481,34 @@ aggr_port_notify_cb(void *arg, mac_notify_type_t type) if (link_state_changed) mac_link_update(grp->lg_mh, grp->lg_link_state); break; - case MAC_NOTE_PROMISC: - port->lp_txinfo = mac_tx_get(port->lp_mh); - break; default: break; } - AGGR_PORT_REFRELE(port); + mac_perim_exit(mph); } int aggr_port_start(aggr_port_t *port) { - int rc; - - ASSERT(RW_WRITE_HELD(&port->lp_lock)); - - if (port->lp_started) - return (0); - - if ((rc = mac_start(port->lp_mh)) != 0) - return (rc); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); - /* update the port state */ - port->lp_started = B_TRUE; + if (!port->lp_started) + port->lp_started = B_TRUE; - return (rc); + return (0); } void aggr_port_stop(aggr_port_t *port) { - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (!port->lp_started) return; - aggr_grp_multicst_port(port, B_FALSE); - - mac_stop(port->lp_mh); + if (port->lp_state == AGGR_PORT_STATE_ATTACHED) + aggr_grp_multicst_port(port, B_FALSE); /* update the port state */ port->lp_started = B_FALSE; @@ -487,33 +519,46 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) { int rc; - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (on == port->lp_promisc_on) /* already in desired promiscous mode */ return (0); - rc = mac_promisc_set(port->lp_mh, on, MAC_DEVPROMISC); + if (on) { + mac_rx_clear(port->lp_mch); + rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, + aggr_recv_cb, port, &port->lp_mphp, + MAC_PROMISC_FLAGS_NO_TX_LOOP); + if (rc != 0) { + mac_rx_set(port->lp_mch, aggr_recv_cb, port); + return (rc); + } + } else { + rc = mac_promisc_remove(port->lp_mphp); + if (rc != 0) + return (rc); + port->lp_mphp = NULL; + mac_rx_set(port->lp_mch, aggr_recv_cb, port); + } - if (rc == 0) - port->lp_promisc_on = on; + port->lp_promisc_on = on; - return (rc); + return (0); } /* * Set the MAC address of a 
port. */ int -aggr_port_unicst(aggr_port_t *port, uint8_t *macaddr) +aggr_port_unicst(aggr_port_t *port) { - int rc; - - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + aggr_grp_t *grp = port->lp_grp; - rc = mac_unicst_set(port->lp_mh, macaddr); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); - return (rc); + return (mac_unicast_primary_set(port->lp_mh, grp->lg_addr)); } /* @@ -524,8 +569,12 @@ aggr_port_multicst(void *arg, boolean_t add, const uint8_t *addrp) { aggr_port_t *port = arg; - return (add ? mac_multicst_add(port->lp_mh, addrp) : - mac_multicst_remove(port->lp_mh, addrp)); + if (add) { + return (mac_multicast_add(port->lp_mch, addrp)); + } else { + mac_multicast_remove(port->lp_mch, addrp); + return (0); + } } uint64_t @@ -533,3 +582,101 @@ aggr_port_stat(aggr_port_t *port, uint_t stat) { return (mac_stat_get(port->lp_mh, stat)); } + +/* + * Add a non-primary unicast address to the underlying port. If the port + * supports HW Rx group, try to add the address into the HW Rx group of + * the port first. If that fails, or if the port does not support HW Rx + * group, enable the port's promiscous mode. + */ +int +aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) +{ + aggr_unicst_addr_t *addr, **pprev; + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* + * If the underlying port support HW Rx group, add the mac to its + * RX group directly. + */ + if ((port->lp_hwgh != NULL) && + ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) { + mac_perim_exit(pmph); + return (0); + } + + /* + * If that fails, or if the port does not support HW Rx group, enable + * the port's promiscous mode. (Note that we turn on the promiscous + * mode only if the port is already started. + */ + if (port->lp_started && + ((err = aggr_port_promisc(port, B_TRUE)) != 0)) { + mac_perim_exit(pmph); + return (err); + } + + /* + * Walk through the unicast addresses that requires promiscous mode + * enabled on this port, and add this address to the end of the list. + */ + pprev = &port->lp_prom_addr; + while ((addr = *pprev) != NULL) { + ASSERT(bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0); + pprev = &addr->aua_next; + } + addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); + bcopy(mac_addr, addr->aua_addr, ETHERADDRL); + addr->aua_next = NULL; + *pprev = addr; + mac_perim_exit(pmph); + return (0); +} + +/* + * Remove a non-primary unicast address from the underlying port. This address + * must has been added by aggr_port_addmac(). As a result, we probably need to + * remove the address from the port's HW Rx group, or to disable the port's + * promiscous mode. + */ +void +aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) +{ + aggr_grp_t *grp = port->lp_grp; + aggr_unicst_addr_t *addr, **pprev; + mac_perim_handle_t pmph; + + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* + * See whether this address is in the list of addresses that requires + * the port being promiscous mode. + */ + pprev = &port->lp_prom_addr; + while ((addr = *pprev) != NULL) { + if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) + break; + pprev = &addr->aua_next; + } + if (addr != NULL) { + /* + * This unicast address put the port into the promiscous mode, + * delete this address from the lp_prom_addr list. If this is + * the last address in that list, disable the promiscous mode + * if the aggregation is not in promiscous mode. 
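+ *
+ * [Editor's note: an illustrative sketch, not part of the original
+ * patch; match() is a hypothetical stand-in for the bcmp() test
+ * below.] Both this walk and the one in aggr_port_addmac() use the
+ * pointer-to-pointer idiom, so unlinking needs no head-of-list
+ * special case:
+ *
+ *	pprev = &port->lp_prom_addr;
+ *	while ((addr = *pprev) != NULL) {
+ *		if (match(addr, mac_addr))
+ *			break;
+ *		pprev = &addr->aua_next;
+ *	}
+ *	if (addr != NULL)
+ *		*pprev = addr->aua_next;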
+ */ + *pprev = addr->aua_next; + kmem_free(addr, sizeof (aggr_unicst_addr_t)); + if (port->lp_prom_addr == NULL && !grp->lg_promisc) + (void) aggr_port_promisc(port, B_FALSE); + } else { + ASSERT(port->lp_hwgh != NULL); + (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr); + } + mac_perim_exit(pmph); +} diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index bf98e65ee3..2bdb7872e3 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * IEEE 802.3ad Link Aggregation - Receive * @@ -42,7 +40,18 @@ #include <sys/aggr_impl.h> static void -aggr_recv_lacp(aggr_port_t *port, mblk_t *mp) +aggr_mac_rx(mac_handle_t lg_mh, mac_resource_handle_t mrh, mblk_t *mp) +{ + if (mrh == NULL) { + mac_rx(lg_mh, mrh, mp); + } else { + aggr_pseudo_rx_ring_t *ring = (aggr_pseudo_rx_ring_t *)mrh; + mac_rx_ring(lg_mh, ring->arr_rh, mp, ring->arr_gen); + } +} + +void +aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) { aggr_grp_t *grp = port->lp_grp; @@ -51,35 +60,26 @@ aggr_recv_lacp(aggr_port_t *port, mblk_t *mp) mblk_t *nmp = copymsg(mp); if (nmp != NULL) - mac_rx(grp->lg_mh, NULL, nmp); + aggr_mac_rx(grp->lg_mh, mrh, nmp); } - aggr_lacp_rx(port, mp); + aggr_lacp_rx_enqueue(port, mp); } /* * Callback function invoked by MAC service module when packets are * made available by a MAC port. */ +/* ARGSUSED */ void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; - /* - * If this message is looped back from the legacy devices, drop - * it as the Nemo framework will be responsible for looping it - * back by the mac_txloop() function. 
- */ - if (mp->b_flag & MSGNOLOOP) { - ASSERT(mp->b_next == NULL); - freemsg(mp); - return; - } - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { - mac_rx(grp->lg_mh, mrh, mp); + aggr_mac_rx(grp->lg_mh, mrh, mp); } else { mblk_t *cmp, *last, *head; struct ether_header *ehp; @@ -100,10 +100,12 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) } else { /* send up accumulated packets */ last->b_next = NULL; - if (port->lp_collector_enabled) - mac_rx(grp->lg_mh, mrh, head); - else + if (port->lp_collector_enabled) { + aggr_mac_rx(grp->lg_mh, mrh, + head); + } else { freemsgchain(head); + } head = cmp->b_next; cmp->b_next = NULL; freemsg(cmp); @@ -126,21 +128,23 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) ASSERT(last == NULL); head = cmp->b_next; cmp->b_next = NULL; - aggr_recv_lacp(port, cmp); + aggr_recv_lacp(port, mrh, cmp); cmp = head; } else { /* previously accumulated packets */ ASSERT(last != NULL); /* send up non-LACP packets */ last->b_next = NULL; - if (port->lp_collector_enabled) - mac_rx(grp->lg_mh, mrh, head); - else + if (port->lp_collector_enabled) { + aggr_mac_rx(grp->lg_mh, mrh, + head); + } else { freemsgchain(head); + } /* unlink and pass up LACP packets */ head = cmp->b_next; cmp->b_next = NULL; - aggr_recv_lacp(port, cmp); + aggr_recv_lacp(port, mrh, cmp); cmp = head; last = NULL; } @@ -151,7 +155,7 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) } if (head != NULL) { if (port->lp_collector_enabled) - mac_rx(grp->lg_mh, mrh, head); + aggr_mac_rx(grp->lg_mh, mrh, head); else freemsgchain(head); } diff --git a/usr/src/uts/common/io/aggr/aggr_send.c b/usr/src/uts/common/io/aggr/aggr_send.c index 467f8541a3..9b4ad24621 100644 --- a/usr/src/uts/common/io/aggr/aggr_send.c +++ b/usr/src/uts/common/io/aggr/aggr_send.c @@ -55,18 +55,19 @@ static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *); -static uint_t -aggr_send_port(aggr_grp_t *grp, mblk_t *mp) +static uint64_t +aggr_send_hash(aggr_grp_t *grp, mblk_t *mp) { struct ether_header *ehp; uint16_t sap; uint_t skip_len; uint8_t proto; uint32_t policy = grp->lg_tx_policy; - uint32_t hash = 0; + uint64_t hash = 0; ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + ASSERT(RW_READ_HELD(&grp->lg_tx_lock)); /* compute MAC hash */ @@ -207,7 +208,7 @@ again: } done: - return (hash % grp->lg_ntx_ports); + return (hash); } /* @@ -216,8 +217,7 @@ done: void aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); grp->lg_tx_policy = policy; } @@ -231,35 +231,63 @@ aggr_m_tx(void *arg, mblk_t *mp) aggr_grp_t *grp = arg; aggr_port_t *port; mblk_t *nextp; - const mac_txinfo_t *mtp; + mac_tx_cookie_t cookie; + uint64_t hash; + void *mytx_handle; for (;;) { - AGGR_LACP_LOCK_READER(grp) + rw_enter(&grp->lg_tx_lock, RW_READER); if (grp->lg_ntx_ports == 0) { /* * We could have returned from aggr_m_start() before * the ports were actually attached. Drop the chain. */ - AGGR_LACP_UNLOCK(grp) + rw_exit(&grp->lg_tx_lock); freemsgchain(mp); return (NULL); } + nextp = mp->b_next; mp->b_next = NULL; - port = grp->lg_tx_ports[aggr_send_port(grp, mp)]; - ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED); + hash = aggr_send_hash(grp, mp); + port = grp->lg_tx_ports[hash % grp->lg_ntx_ports]; /* - * We store the transmit info pointer locally in case it - * changes between loading mt_fn and mt_arg. 
+ * Bump the active Tx ref count so that the port won't + * be deleted. The reference count will be dropped in mac_tx(). */ - mtp = port->lp_txinfo; - AGGR_LACP_UNLOCK(grp) + mytx_handle = mac_tx_hold(port->lp_mch); + rw_exit(&grp->lg_tx_lock); - if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { - mp->b_next = nextp; - break; + if (mytx_handle == NULL) { + /* + * The port is quiesced. + */ + freemsg(mp); + } else { + mblk_t *ret_mp; + + /* + * It is fine that the port state changes now. + * Set MAC_TX_NO_HOLD to inform mac_tx() not to bump + * the active Tx ref again. Use hash as the hint so + * to direct traffic to different TX rings. Note below + * bit operation is needed to get the most benefit + * from the mac_tx() hash algorithm. + */ + hash = (hash << 24 | hash << 16 | hash); + hash = (hash << 32 | hash); + cookie = mac_tx(port->lp_mch, mp, (uintptr_t)hash, + MAC_TX_NO_ENQUEUE | MAC_TX_NO_HOLD, &ret_mp); + + mac_tx_rele(port->lp_mch, mytx_handle); + + if (cookie != NULL) { + ret_mp->b_next = nextp; + mp = ret_mp; + break; + } } if ((mp = nextp) == NULL) @@ -276,6 +304,8 @@ aggr_send_port_enable(aggr_port_t *port) { aggr_grp_t *grp = port->lp_grp; + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + if (port->lp_tx_enabled || (port->lp_state != AGGR_PORT_STATE_ATTACHED)) { /* already enabled or port not yet attached */ @@ -285,6 +315,7 @@ aggr_send_port_enable(aggr_port_t *port) /* * Add to group's array of tx ports. */ + rw_enter(&grp->lg_tx_lock, RW_WRITER); if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) { /* current array too small */ aggr_port_t **new_ports; @@ -308,6 +339,7 @@ aggr_send_port_enable(aggr_port_t *port) grp->lg_tx_ports[grp->lg_ntx_ports++] = port; port->lp_tx_idx = grp->lg_ntx_ports-1; + rw_exit(&grp->lg_tx_lock); port->lp_tx_enabled = B_TRUE; } @@ -321,13 +353,15 @@ aggr_send_port_disable(aggr_port_t *port) uint_t idx, ntx; aggr_grp_t *grp = port->lp_grp; - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (!port->lp_tx_enabled) { /* not yet enabled */ return; } + rw_enter(&grp->lg_tx_lock, RW_WRITER); idx = port->lp_tx_idx; ntx = grp->lg_ntx_ports; ASSERT(idx < ntx); @@ -347,6 +381,7 @@ aggr_send_port_disable(aggr_port_t *port) port->lp_tx_idx = 0; grp->lg_ntx_ports--; + rw_exit(&grp->lg_tx_lock); port->lp_tx_enabled = B_FALSE; } diff --git a/usr/src/uts/common/io/ath/ath_main.c b/usr/src/uts/common/io/ath/ath_main.c index b18451e570..451f827415 100644 --- a/usr/src/uts/common/io/ath/ath_main.c +++ b/usr/src/uts/common/io/ath/ath_main.c @@ -132,7 +132,7 @@ #include <sys/sunddi.h> #include <sys/pci.h> #include <sys/errno.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/dlpi.h> #include <sys/ethernet.h> #include <sys/list.h> @@ -232,7 +232,6 @@ static mac_callbacks_t ath_m_callbacks = { ath_m_multicst, ath_m_unicst, ath_m_tx, - NULL, /* mc_resources; */ ath_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/bge/bge.conf b/usr/src/uts/common/io/bge/bge.conf index 71a44f851a..edabf29ab1 100644 --- a/usr/src/uts/common/io/bge/bge.conf +++ b/usr/src/uts/common/io/bge/bge.conf @@ -171,6 +171,6 @@ bge-known-subsystems = 0x108e1647, # For BCM5705, BCM5782, etc, there are only 1 receive ring and 1 send ring. # Otherwise, there can be up to 16 receive rings and 4 send rings. 
# -bge-rx-rings = 1; +bge-rx-rings = 16; bge-tx-rings = 1; diff --git a/usr/src/uts/common/io/bge/bge_chip2.c b/usr/src/uts/common/io/bge/bge_chip2.c index 4c17aaa5a9..d91ac5f0f6 100644 --- a/usr/src/uts/common/io/bge/bge_chip2.c +++ b/usr/src/uts/common/io/bge/bge_chip2.c @@ -1838,29 +1838,13 @@ bge_nvmem_id(bge_t *bgep) static void bge_init_recv_rule(bge_t *bgep) { - bge_recv_rule_t *rulep; + bge_recv_rule_t *rulep = bgep->recv_rules; uint32_t i; /* - * receive rule: direct all TCP traffic to ring RULE_MATCH_TO_RING - * 1. to direct UDP traffic, set: - * rulep->control = RULE_PROTO_CONTROL; - * rulep->mask_value = RULE_UDP_MASK_VALUE; - * 2. to direct ICMP traffic, set: - * rulep->control = RULE_PROTO_CONTROL; - * rulep->mask_value = RULE_ICMP_MASK_VALUE; - * 3. to direct traffic by source ip, set: - * rulep->control = RULE_SIP_CONTROL; - * rulep->mask_value = RULE_SIP_MASK_VALUE; + * Initialize receive rule registers. + * Note that rules may persist across each bge_m_start/stop() call. */ - rulep = bgep->recv_rules; - rulep->control = RULE_PROTO_CONTROL; - rulep->mask_value = RULE_TCP_MASK_VALUE; - - /* - * set receive rule registers - */ - rulep = bgep->recv_rules; for (i = 0; i < RECV_RULES_NUM_MAX; i++, rulep++) { bge_reg_put32(bgep, RECV_RULE_MASK_REG(i), rulep->mask_value); bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i), rulep->control); @@ -2871,10 +2855,11 @@ bge_chip_sync(bge_t *bgep) } bge_reg_put32(bgep, MAC_TX_RANDOM_BACKOFF_REG, fill); bge_reg_put64(bgep, MAC_ADDRESS_REG(j), macaddr); - } - BGE_DEBUG(("bge_chip_sync($%p) setting MAC address %012llx", - (void *)bgep, macaddr)); + BGE_DEBUG(("bge_chip_sync($%p) " + "setting MAC address %012llx", + (void *)bgep, macaddr)); + } #ifdef BGE_IPMI_ASF } #endif @@ -5515,14 +5500,25 @@ bge_chip_ioctl(bge_t *bgep, queue_t *wq, mblk_t *mp, struct iocblk *iocp) /* NOTREACHED */ } +/* ARGSUSED */ void -bge_chip_blank(void *arg, time_t ticks, uint_t count) +bge_chip_blank(void *arg, time_t ticks, uint_t count, int flag) { - bge_t *bgep = arg; + recv_ring_t *rrp = arg; + bge_t *bgep = rrp->bgep; mutex_enter(bgep->genlock); + rrp->poll_flag = flag; +#ifdef NOT_YET + /* + * XXX-Sunay: Since most broadcom cards support only one + * interrupt but multiple rx rings, we can't disable the + * physical interrupt. This need to be done via capability + * negotiation depending on the NIC. + */ bge_reg_put32(bgep, RCV_COALESCE_TICKS_REG, ticks); bge_reg_put32(bgep, RCV_COALESCE_MAX_BD_REG, count); +#endif if (bge_check_acc_handle(bgep, bgep->io_handle) != DDI_FM_OK) ddi_fm_service_impact(bgep->devinfo, DDI_SERVICE_UNAFFECTED); mutex_exit(bgep->genlock); diff --git a/usr/src/uts/common/io/bge/bge_hw.h b/usr/src/uts/common/io/bge/bge_hw.h index 2ebdc1a7a3..1974faea88 100644 --- a/usr/src/uts/common/io/bge/bge_hw.h +++ b/usr/src/uts/common/io/bge/bge_hw.h @@ -858,30 +858,53 @@ extern "C" { /* * Receive Rules definition */ -#define RULE_MATCH_TO_RING 2 - /* ring that traffic will go into when recv rule matches. 
*/ - /* value is between 1 and 16, not 0 and 15 */ - +#define ETHERHEADER_DEST_OFFSET 0x00 #define IPHEADER_PROTO_OFFSET 0x08 #define IPHEADER_SIP_OFFSET 0x0c +#define IPHEADER_DIP_OFFSET 0x10 +#define TCPHEADER_SPORT_OFFSET 0x00 +#define TCPHEADER_DPORT_OFFSET 0x02 +#define UDPHEADER_SPORT_OFFSET 0x00 +#define UDPHEADER_DPORT_OFFSET 0x02 + +#define RULE_MATCH(ring) (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_OP_EQ | \ + RECV_RULE_CTL_CLASS((ring))) + +#define RULE_MATCH_MASK(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_MASK) + +#define RULE_DEST_MAC_1(ring) (RULE_MATCH(ring) | \ + RECV_RULE_CTL_HEADER_FRAME | \ + ETHERHEADER_DEST_OFFSET) + +#define RULE_DEST_MAC_2(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_FRAME | \ + ETHERHEADER_DEST_OFFSET + 4) + +#define RULE_LOCAL_IP(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_HEADER_IP | \ + IPHEADER_DIP_OFFSET) + +#define RULE_REMOTE_IP(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_HEADER_IP | \ + IPHEADER_SIP_OFFSET) -#define RULE_PROTO_CONTROL (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_MASK | \ - RECV_RULE_CTL_OP_EQ | \ +#define RULE_IP_PROTO(ring) (RULE_MATCH_MASK(ring) | \ RECV_RULE_CTL_HEADER_IP | \ - RECV_RULE_CTL_CLASS(RULE_MATCH_TO_RING) | \ IPHEADER_PROTO_OFFSET) -#define RULE_TCP_MASK_VALUE 0x00ff0006 -#define RULE_UDP_MASK_VALUE 0x00ff0011 -#define RULE_ICMP_MASK_VALUE 0x00ff0001 -#define RULE_SIP_ADDR 0x0a000001 - /* ip address in 32-bit integer,such as, 0x0a000001 is "10.0.0.1" */ +#define RULE_TCP_SPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_TCP | \ + TCPHEADER_SPORT_OFFSET) -#define RULE_SIP_CONTROL (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_OP_EQ | \ - RECV_RULE_CTL_HEADER_IP | \ - RECV_RULE_CTL_CLASS(RULE_MATCH_TO_RING) | \ - IPHEADER_SIP_OFFSET) -#define RULE_SIP_MASK_VALUE RULE_SIP_ADDR +#define RULE_TCP_DPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_TCP | \ + TCPHEADER_DPORT_OFFSET) + +#define RULE_UDP_SPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_UDP | \ + UDPHEADER_SPORT_OFFSET) + +#define RULE_UDP_DPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_UDP | \ + UDPHEADER_DPORT_OFFSET) /* * 1000BaseX low-level access registers @@ -1686,6 +1709,14 @@ typedef struct { } bge_recv_rule_t; /* + * This describes which sub-rule slots are used by a particular rule. + */ +typedef struct { + int start; + int count; +} bge_rule_info_t; + +/* * Indexes into the <buff_cons_index> array */ #ifdef _BIG_ENDIAN diff --git a/usr/src/uts/common/io/bge/bge_impl.h b/usr/src/uts/common/io/bge/bge_impl.h index 961bf14064..3d2b73f325 100644 --- a/usr/src/uts/common/io/bge/bge_impl.h +++ b/usr/src/uts/common/io/bge/bge_impl.h @@ -71,7 +71,7 @@ extern "C" { #include <sys/fm/util.h> #include <sys/fm/io/ddi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #ifdef __amd64 @@ -397,6 +397,13 @@ typedef struct buff_ring { void *spare[4]; /* padding */ } buff_ring_t; /* 0x100 (256) bytes */ +typedef struct bge_multi_mac { + int naddr; /* total supported addresses */ + int naddrfree; /* free addresses slots */ + ether_addr_t mac_addr[MAC_ADDRESS_REGS_MAX]; + boolean_t mac_addr_set[MAC_ADDRESS_REGS_MAX]; +} bge_multi_mac_t; + /* * Software Receive (Return) Ring Control Block * There's one of these for each receiver return ring (up to 16). 
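 *
 * [Editor's note: an illustrative summary, not part of the original
 * patch.] The mac-layer ring fields added to this structure in the
 * hunk below are wired up elsewhere in this changeset as follows:
 *
 *	rx_ring->ring_handle = rh;		bge_fill_ring()
 *	rx_ring->ring_gen_num = mr_gen_num;	bge_ring_start()
 *	rrp->poll_flag = flag;			bge_chip_blank()
 *	mac_rx_ring(bgep->mh, rrp->ring_handle,
 *	    mp, rrp->ring_gen_num);		bge_receive()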
@@ -418,7 +425,6 @@ typedef struct recv_ring { volatile uint16_t *prod_index_p; /* (const) ptr to h/w */ /* "producer index" */ /* (in status block) */ - /* * The rx_lock must be held when updating the h/w consumer index * mailbox register (*chip_mbox_reg), or the s/w consumer index @@ -428,10 +434,16 @@ typedef struct recv_ring { /* index mailbox offset */ kmutex_t rx_lock[1]; /* serialize receive */ uint64_t rx_next; /* next slot to examine */ - mac_resource_handle_t handle; /* per ring cookie */ - /* ("producer index") */ + + mac_ring_handle_t ring_handle; + mac_group_handle_t ring_group_handle; + uint64_t ring_gen_num; + bge_rule_info_t *mac_addr_rule; + uint8_t mac_addr_val[ETHERADDRL]; + int poll_flag; /* Polling flag */ } recv_ring_t; /* 0x90 (144) bytes */ + /* * Send packet structure */ @@ -528,6 +540,7 @@ typedef struct send_ring { sw_sbd_t *sw_sbds; /* software descriptors */ uint64_t mac_resid; /* special per resource id */ + uint64_t pushed_bytes; } send_ring_t; /* 0x100 (256) bytes */ typedef struct { @@ -760,6 +773,8 @@ typedef struct bge { * Note: they're not necessarily all used. */ buff_ring_t buff[BGE_BUFF_RINGS_MAX]; /* 3*0x0100 */ + + /* may be obsoleted */ recv_ring_t recv[BGE_RECV_RINGS_MAX]; /* 16*0x0090 */ send_ring_t send[BGE_SEND_RINGS_MAX]; /* 16*0x0100 */ @@ -1158,7 +1173,8 @@ int bge_chip_sync(bge_t *bgep, boolean_t asf_keeplive); int bge_chip_reset(bge_t *bgep, boolean_t enable_dma); int bge_chip_sync(bge_t *bgep); #endif -void bge_chip_blank(void *arg, time_t ticks, uint_t count); +void bge_chip_blank(void *arg, time_t ticks, uint_t count, int flag); +extern mblk_t *bge_poll_ring(void *, int); uint_t bge_chip_factotum(caddr_t arg); void bge_chip_cyclic(void *arg); enum ioc_reply bge_chip_ioctl(bge_t *bgep, queue_t *wq, mblk_t *mp, @@ -1222,6 +1238,7 @@ void bge_receive(bge_t *bgep, bge_status_t *bsp); /* bge_send.c */ mblk_t *bge_m_tx(void *arg, mblk_t *mp); +mblk_t *bge_ring_tx(void *arg, mblk_t *mp); void bge_recycle(bge_t *bgep, bge_status_t *bsp); uint_t bge_send_drain(caddr_t arg); diff --git a/usr/src/uts/common/io/bge/bge_main2.c b/usr/src/uts/common/io/bge/bge_main2.c index fc4407214e..c8cef32365 100644 --- a/usr/src/uts/common/io/bge/bge_main2.c +++ b/usr/src/uts/common/io/bge/bge_main2.c @@ -26,7 +26,9 @@ #include "bge_impl.h" #include <sys/sdt.h> +#include <sys/mac_provider.h> #include <sys/mac.h> +#include <sys/mac_flow.h> /* * This is the string displayed by modinfo, etc. 
@@ -52,6 +54,7 @@ static char default_mtu[] = "default_mtu"; static int bge_add_intrs(bge_t *, int); static void bge_rem_intrs(bge_t *); +static int bge_unicst_set(void *, const uint8_t *, int); /* * Describes the chip's DMA engine @@ -104,16 +107,10 @@ static int bge_m_start(void *); static void bge_m_stop(void *); static int bge_m_promisc(void *, boolean_t); static int bge_m_multicst(void *, boolean_t, const uint8_t *); -static int bge_m_unicst(void *, const uint8_t *); -static void bge_m_resources(void *); static void bge_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t bge_m_getcapab(void *, mac_capab_t, void *); static int bge_unicst_set(void *, const uint8_t *, - mac_addr_slot_t); -static int bge_m_unicst_add(void *, mac_multi_addr_t *); -static int bge_m_unicst_remove(void *, mac_addr_slot_t); -static int bge_m_unicst_modify(void *, mac_multi_addr_t *); -static int bge_m_unicst_get(void *, mac_multi_addr_t *); + int); static int bge_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); static int bge_m_getprop(void *, const char *, mac_prop_id_t, @@ -123,8 +120,7 @@ static int bge_set_priv_prop(bge_t *, const char *, uint_t, static int bge_get_priv_prop(bge_t *, const char *, uint_t, uint_t, void *); -#define BGE_M_CALLBACK_FLAGS\ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) +#define BGE_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) static mac_callbacks_t bge_m_callbacks = { BGE_M_CALLBACK_FLAGS, @@ -133,9 +129,8 @@ static mac_callbacks_t bge_m_callbacks = { bge_m_stop, bge_m_promisc, bge_m_multicst, - bge_m_unicst, + NULL, bge_m_tx, - bge_m_resources, bge_m_ioctl, bge_m_getcapab, NULL, @@ -152,6 +147,7 @@ mac_priv_prop_t bge_priv_prop[] = { #define BGE_MAX_PRIV_PROPS \ (sizeof (bge_priv_prop) / sizeof (mac_priv_prop_t)) +uint8_t zero_addr[6] = {0, 0, 0, 0, 0, 0}; /* * ========== Transmit and receive ring reinitialisation ========== */ @@ -590,23 +586,10 @@ bge_m_start(void *arg) } /* - * bge_m_unicst() -- set the physical network address - */ -static int -bge_m_unicst(void *arg, const uint8_t *macaddr) -{ - /* - * Request to set address in - * address slot 0, i.e., default address - */ - return (bge_unicst_set(arg, macaddr, 0)); -} - -/* * bge_unicst_set() -- set the physical network address */ static int -bge_unicst_set(void *arg, const uint8_t *macaddr, mac_addr_slot_t slot) +bge_unicst_set(void *arg, const uint8_t *macaddr, int slot) { bge_t *bgep = arg; /* private device info */ @@ -693,160 +676,6 @@ bge_unicst_set(void *arg, const uint8_t *macaddr, mac_addr_slot_t slot) return (0); } -/* - * The following four routines are used as callbacks for multiple MAC - * address support: - * - bge_m_unicst_add(void *, mac_multi_addr_t *); - * - bge_m_unicst_remove(void *, mac_addr_slot_t); - * - bge_m_unicst_modify(void *, mac_multi_addr_t *); - * - bge_m_unicst_get(void *, mac_multi_addr_t *); - */ - -/* - * bge_m_unicst_add() - will find an unused address slot, set the - * address value to the one specified, reserve that slot and enable - * the NIC to start filtering on the new MAC address. - * address slot. Returns 0 on success. 
- */ -static int -bge_m_unicst_add(void *arg, mac_multi_addr_t *maddr) -{ - bge_t *bgep = arg; /* private device info */ - mac_addr_slot_t slot; - int err; - - if (mac_unicst_verify(bgep->mh, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->unicst_addr_avail == 0) { - /* no slots available */ - mutex_exit(bgep->genlock); - return (ENOSPC); - } - - /* - * Primary/default address is in slot 0. The next three - * addresses are the multiple MAC addresses. So multiple - * MAC address 0 is in slot 1, 1 in slot 2, and so on. - * So the first multiple MAC address resides in slot 1. - */ - for (slot = 1; slot < bgep->unicst_addr_total; slot++) { - if (bgep->curr_addr[slot].set == B_FALSE) { - bgep->curr_addr[slot].set = B_TRUE; - break; - } - } - - ASSERT(slot < bgep->unicst_addr_total); - bgep->unicst_addr_avail--; - mutex_exit(bgep->genlock); - maddr->mma_slot = slot; - - if ((err = bge_unicst_set(bgep, maddr->mma_addr, slot)) != 0) { - mutex_enter(bgep->genlock); - bgep->curr_addr[slot].set = B_FALSE; - bgep->unicst_addr_avail++; - mutex_exit(bgep->genlock); - } - return (err); -} - -/* - * bge_m_unicst_remove() - removes a MAC address that was added by a - * call to bge_m_unicst_add(). The slot number that was returned in - * add() is passed in the call to remove the address. - * Returns 0 on success. - */ -static int -bge_m_unicst_remove(void *arg, mac_addr_slot_t slot) -{ - bge_t *bgep = arg; /* private device info */ - - if (slot <= 0 || slot >= bgep->unicst_addr_total) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->curr_addr[slot].set == B_TRUE) { - bgep->curr_addr[slot].set = B_FALSE; - bgep->unicst_addr_avail++; - mutex_exit(bgep->genlock); - /* - * Copy the default address to the passed slot - */ - return (bge_unicst_set(bgep, bgep->curr_addr[0].addr, slot)); - } - mutex_exit(bgep->genlock); - return (EINVAL); -} - -/* - * bge_m_unicst_modify() - modifies the value of an address that - * has been added by bge_m_unicst_add(). The new address, address - * length and the slot number that was returned in the call to add - * should be passed to bge_m_unicst_modify(). mma_flags should be - * set to 0. Returns 0 on success. - */ -static int -bge_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) -{ - bge_t *bgep = arg; /* private device info */ - mac_addr_slot_t slot; - - if (mac_unicst_verify(bgep->mh, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - slot = maddr->mma_slot; - - if (slot <= 0 || slot >= bgep->unicst_addr_total) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->curr_addr[slot].set == B_TRUE) { - mutex_exit(bgep->genlock); - return (bge_unicst_set(bgep, maddr->mma_addr, slot)); - } - mutex_exit(bgep->genlock); - - return (EINVAL); -} - -/* - * bge_m_unicst_get() - will get the MAC address and all other - * information related to the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. 
- * On return, mma_flags can take the following values: - * 1) MMAC_SLOT_UNUSED - * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR - * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR - * 4) MMAC_SLOT_USED - */ -static int -bge_m_unicst_get(void *arg, mac_multi_addr_t *maddr) -{ - bge_t *bgep = arg; /* private device info */ - mac_addr_slot_t slot; - - slot = maddr->mma_slot; - - if (slot <= 0 || slot >= bgep->unicst_addr_total) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->curr_addr[slot].set == B_TRUE) { - ethaddr_copy(bgep->curr_addr[slot].addr, - maddr->mma_addr); - maddr->mma_flags = MMAC_SLOT_USED; - } else { - maddr->mma_flags = MMAC_SLOT_UNUSED; - } - mutex_exit(bgep->genlock); - - return (0); -} - extern void bge_wake_factotum(bge_t *); static boolean_t @@ -1576,6 +1405,295 @@ bge_m_promisc(void *arg, boolean_t on) return (0); } +/* + * Find the slot for the specified unicast address + */ +int +bge_unicst_find(bge_t *bgep, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(bgep->genlock)); + + for (slot = 0; slot < bgep->unicst_addr_total; slot++) { + if (bcmp(bgep->curr_addr[slot].addr, mac_addr, ETHERADDRL) == 0) + return (slot); + } + + return (-1); +} + +/* + * Programs the classifier to start steering packets matching 'mac_addr' to the + * specified ring 'arg'. + */ +static int +bge_addmac(void *arg, const uint8_t *mac_addr) +{ + recv_ring_t *rrp = (recv_ring_t *)arg; + bge_t *bgep = rrp->bgep; + bge_recv_rule_t *rulep = bgep->recv_rules; + bge_rule_info_t *rinfop = NULL; + uint8_t ring = (uint8_t)(rrp - bgep->recv) + 1; + int i; + uint16_t tmp16; + uint32_t tmp32; + int slot; + int err; + + mutex_enter(bgep->genlock); + if (bgep->unicst_addr_avail == 0) { + mutex_exit(bgep->genlock); + return (ENOSPC); + } + + /* + * First add the unicast address to a available slot. + */ + slot = bge_unicst_find(bgep, mac_addr); + ASSERT(slot == -1); + + for (slot = 0; slot < bgep->unicst_addr_total; slot++) { + if (!bgep->curr_addr[slot].set) { + bgep->curr_addr[slot].set = B_TRUE; + break; + } + } + + ASSERT(slot < bgep->unicst_addr_total); + bgep->unicst_addr_avail--; + mutex_exit(bgep->genlock); + + if ((err = bge_unicst_set(bgep, mac_addr, slot)) != 0) + goto fail; + + /* A rule is already here. Deny this. */ + if (rrp->mac_addr_rule != NULL) { + err = ether_cmp(mac_addr, rrp->mac_addr_val) ? EEXIST : EBUSY; + goto fail; + } + + /* + * Allocate a bge_rule_info_t to keep track of which rule slots + * are being used. + */ + rinfop = kmem_zalloc(sizeof (bge_rule_info_t), KM_NOSLEEP); + if (rinfop == NULL) { + err = ENOMEM; + goto fail; + } + + /* + * Look for the starting slot to place the rules. + * The two slots we reserve must be contiguous. 
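+ *
+ * [Editor's note: an illustrative worked example, not part of the
+ * original patch.] For MAC address 00:11:22:33:44:55 the code below
+ * loads the pair of slots as:
+ *
+ *	rulep[i].mask_value   = 0x00112233	first four octets
+ *	rulep[i+1].mask_value = 0xffff4455	16-bit mask | last two
+ *
+ * Slot i carries RECV_RULE_CTL_AND, which evidently chains it to slot
+ * i+1 so that all six octets must match; hence the requirement that
+ * the two slots be contiguous.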
+ */ + for (i = 0; i + 1 < RECV_RULES_NUM_MAX; i++) + if ((rulep[i].control & RECV_RULE_CTL_ENABLE) == 0 && + (rulep[i+1].control & RECV_RULE_CTL_ENABLE) == 0) + break; + + ASSERT(i + 1 < RECV_RULES_NUM_MAX); + + bcopy(mac_addr, &tmp32, sizeof (tmp32)); + rulep[i].mask_value = ntohl(tmp32); + rulep[i].control = RULE_DEST_MAC_1(ring) | RECV_RULE_CTL_AND; + bge_reg_put32(bgep, RECV_RULE_MASK_REG(i), rulep[i].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i), rulep[i].control); + + bcopy(mac_addr + 4, &tmp16, sizeof (tmp16)); + rulep[i+1].mask_value = 0xffff0000 | ntohs(tmp16); + rulep[i+1].control = RULE_DEST_MAC_2(ring); + bge_reg_put32(bgep, RECV_RULE_MASK_REG(i+1), rulep[i+1].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i+1), rulep[i+1].control); + rinfop->start = i; + rinfop->count = 2; + + rrp->mac_addr_rule = rinfop; + bcopy(mac_addr, rrp->mac_addr_val, ETHERADDRL); + + return (0); + +fail: + /* Clear the address just set */ + (void) bge_unicst_set(bgep, zero_addr, slot); + mutex_enter(bgep->genlock); + bgep->curr_addr[slot].set = B_FALSE; + bgep->unicst_addr_avail++; + mutex_exit(bgep->genlock); + + return (err); +} + +/* + * Stop classifying packets matching the MAC address to the specified ring. + */ +static int +bge_remmac(void *arg, const uint8_t *mac_addr) +{ + recv_ring_t *rrp = (recv_ring_t *)arg; + bge_t *bgep = rrp->bgep; + bge_recv_rule_t *rulep = bgep->recv_rules; + bge_rule_info_t *rinfop = rrp->mac_addr_rule; + int start; + int slot; + int err; + + /* + * Remove the MAC address from its slot. + */ + mutex_enter(bgep->genlock); + slot = bge_unicst_find(bgep, mac_addr); + if (slot == -1) { + mutex_exit(bgep->genlock); + return (EINVAL); + } + + ASSERT(bgep->curr_addr[slot].set); + mutex_exit(bgep->genlock); + + if ((err = bge_unicst_set(bgep, zero_addr, slot)) != 0) + return (err); + + if (rinfop == NULL || ether_cmp(mac_addr, rrp->mac_addr_val) != 0) + return (EINVAL); + + start = rinfop->start; + rulep[start].mask_value = 0; + rulep[start].control = 0; + bge_reg_put32(bgep, RECV_RULE_MASK_REG(start), rulep[start].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(start), rulep[start].control); + start++; + rulep[start].mask_value = 0; + rulep[start].control = 0; + bge_reg_put32(bgep, RECV_RULE_MASK_REG(start), rulep[start].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(start), rulep[start].control); + + kmem_free(rinfop, sizeof (bge_rule_info_t)); + rrp->mac_addr_rule = NULL; + bzero(rrp->mac_addr_val, ETHERADDRL); + + mutex_enter(bgep->genlock); + bgep->curr_addr[slot].set = B_FALSE; + bgep->unicst_addr_avail++; + mutex_exit(bgep->genlock); + + return (0); +} + +static int +bge_flag_intr_enable(mac_intr_handle_t ih) +{ + recv_ring_t *rrp = (recv_ring_t *)ih; + bge_t *bgep = rrp->bgep; + + mutex_enter(bgep->genlock); + rrp->poll_flag = 0; + mutex_exit(bgep->genlock); + + return (0); +} + +static int +bge_flag_intr_disable(mac_intr_handle_t ih) +{ + recv_ring_t *rrp = (recv_ring_t *)ih; + bge_t *bgep = rrp->bgep; + + mutex_enter(bgep->genlock); + rrp->poll_flag = 1; + mutex_exit(bgep->genlock); + + return (0); +} + +static int +bge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +{ + recv_ring_t *rx_ring; + + rx_ring = (recv_ring_t *)rh; + mutex_enter(rx_ring->rx_lock); + rx_ring->ring_gen_num = mr_gen_num; + mutex_exit(rx_ring->rx_lock); + return (0); +} + + +/* + * Callback funtion for MAC layer to register all rings + * for given ring_group, noted by rg_index. 
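+ *
+ * [Editor's note: an illustrative sketch, not part of the original
+ * patch; the framework-side loop shown is an assumption inferred from
+ * the MAC_CAPAB_RINGS data filled in by bge_m_getcapab().] The mac
+ * layer is expected to call back roughly as:
+ *
+ *	for (g = 0; g < cap_rings->mr_gnum; g++) {
+ *		cap_rings->mr_gget(arg, MAC_RING_TYPE_RX, g, &ginfo, gh);
+ *		for (r = 0; r < ginfo.mgi_count; r++)
+ *			cap_rings->mr_rget(arg, MAC_RING_TYPE_RX, g, r,
+ *			    &rinfo, rh);
+ *	}
+ *
+ * Since bge advertises static groups of one ring each (mgi_count is
+ * set to 1 in bge_fill_group()), the ring index here is always 0.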
+ */ +void +bge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + bge_t *bgep = arg; + mac_intr_t *mintr; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + recv_ring_t *rx_ring; + ASSERT(rg_index >= 0 && rg_index < MIN(bgep->chipid.rx_rings, + MAC_ADDRESS_REGS_MAX) && index == 0); + + rx_ring = &bgep->recv[rg_index]; + rx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = bge_ring_start; + infop->mri_stop = NULL; + infop->mri_poll = bge_poll_ring; + + mintr = &infop->mri_intr; + mintr->mi_handle = (mac_intr_handle_t)rx_ring; + mintr->mi_enable = bge_flag_intr_enable; + mintr->mi_disable = bge_flag_intr_disable; + + break; + } + case MAC_RING_TYPE_TX: + default: + ASSERT(0); + break; + } +} + +/* + * Fill infop passed as argument + * fill in respective ring_group info + * Each group has a single ring in it. We keep it simple + * and use the same internal handle for rings and groups. + */ +void +bge_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + bge_t *bgep = arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + recv_ring_t *rx_ring; + + ASSERT(rg_index >= 0 && rg_index < MIN(bgep->chipid.rx_rings, + MAC_ADDRESS_REGS_MAX)); + rx_ring = &bgep->recv[rg_index]; + rx_ring->ring_group_handle = gh; + + infop->mgi_driver = (mac_group_driver_t)rx_ring; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = bge_addmac; + infop->mgi_remmac = bge_remmac; + infop->mgi_count = 1; + break; + } + case MAC_RING_TYPE_TX: + default: + ASSERT(0); + break; + } +} + /*ARGSUSED*/ static boolean_t bge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) @@ -1589,38 +1707,20 @@ bge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *txflags = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; break; } + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. - */ - break; - - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; + /* Temporarily disable multiple tx rings. */ + if (cap_rings->mr_type != MAC_RING_TYPE_RX) + return (B_FALSE); - mutex_enter(bgep->genlock); - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. 
- */ - mmacp->maddr_naddr = bgep->unicst_addr_total - 1; - mmacp->maddr_naddrfree = bgep->unicst_addr_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = bgep; - mmacp->maddr_add = bge_m_unicst_add; - mmacp->maddr_remove = bge_m_unicst_remove; - mmacp->maddr_modify = bge_m_unicst_modify; - mmacp->maddr_get = bge_m_unicst_get; - mmacp->maddr_reserve = NULL; - mutex_exit(bgep->genlock); + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = cap_rings->mr_gnum = + MIN(bgep->chipid.rx_rings, MAC_ADDRESS_REGS_MAX); + cap_rings->mr_rget = bge_fill_ring; + cap_rings->mr_gget = bge_fill_group; break; } - default: return (B_FALSE); } @@ -1889,43 +1989,6 @@ bge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } } -static void -bge_resources_add(bge_t *bgep, time_t time, uint_t pkt_cnt) -{ - - recv_ring_t *rrp; - mac_rx_fifo_t mrf; - int ring; - - /* - * Register Rx rings as resources and save mac - * resource id for future reference - */ - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = bge_chip_blank; - mrf.mrf_arg = (void *)bgep; - mrf.mrf_normal_blank_time = time; - mrf.mrf_normal_pkt_count = pkt_cnt; - - for (ring = 0; ring < bgep->chipid.rx_rings; ring++) { - rrp = &bgep->recv[ring]; - rrp->handle = mac_resource_add(bgep->mh, - (mac_resource_t *)&mrf); - } -} - -static void -bge_m_resources(void *arg) -{ - bge_t *bgep = arg; - - mutex_enter(bgep->genlock); - - bge_resources_add(bgep, bgep->chipid.rx_ticks_norm, - bgep->chipid.rx_count_norm); - mutex_exit(bgep->genlock); -} - /* * ========== Per-instance setup/teardown code ========== */ @@ -3404,29 +3467,23 @@ bge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) * Determine whether to override the chip's own MAC address */ bge_find_mac_address(bgep, cidp); - ethaddr_copy(cidp->vendor_addr.addr, bgep->curr_addr[0].addr); - bgep->curr_addr[0].set = B_TRUE; bgep->unicst_addr_total = MAC_ADDRESS_REGS_MAX; - /* - * Address available is one less than MAX - * as primary address is not advertised - * as a multiple MAC address. - */ - bgep->unicst_addr_avail = MAC_ADDRESS_REGS_MAX - 1; + bgep->unicst_addr_avail = MAC_ADDRESS_REGS_MAX; if ((macp = mac_alloc(MAC_VERSION)) == NULL) goto attach_fail; macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; macp->m_driver = bgep; macp->m_dip = devinfo; - macp->m_src_addr = bgep->curr_addr[0].addr; + macp->m_src_addr = cidp->vendor_addr.addr; macp->m_callbacks = &bge_m_callbacks; macp->m_min_sdu = 0; macp->m_max_sdu = cidp->ethmax_size - sizeof (struct ether_header); macp->m_margin = VLAN_TAGSZ; macp->m_priv_props = bge_priv_prop; macp->m_priv_prop_count = BGE_MAX_PRIV_PROPS; + macp->m_v12n = MAC_VIRT_LEVEL1; /* * Finally, we're ready to register ourselves with the MAC layer diff --git a/usr/src/uts/common/io/bge/bge_recv2.c b/usr/src/uts/common/io/bge/bge_recv2.c index 60df201711..2c8bb20f71 100644 --- a/usr/src/uts/common/io/bge/bge_recv2.c +++ b/usr/src/uts/common/io/bge/bge_recv2.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "bge_impl.h" #define U32TOPTR(x) ((void *)(uintptr_t)(uint32_t)(x)) @@ -274,7 +272,9 @@ error: * the chip to indicate the packets it has accepted from the ring. */ static mblk_t *bge_receive_ring(bge_t *bgep, recv_ring_t *rrp); +#ifndef DEBUG #pragma inline(bge_receive_ring) +#endif static mblk_t * bge_receive_ring(bge_t *bgep, recv_ring_t *rrp) @@ -328,36 +328,61 @@ bge_receive_ring(bge_t *bgep, recv_ring_t *rrp) } /* - * Receive all packets in all rings. 
- * - * To give priority to low-numbered rings, whenever we have received any - * packets in any ring except 0, we restart scanning again from ring 0. - * Thus, for example, if rings 0, 3, and 10 are carrying traffic, the - * pattern of receives might go 0, 3, 10, 3, 0, 10, 0: - * - * 0 found some - receive them - * 1..2 none found - * 3 found some - receive them and restart scan - * 0..9 none found - * 10 found some - receive them and restart scan - * 0..2 none found - * 3 found some more - receive them and restart scan - * 0 found some more - receive them - * 1..9 none found - * 10 found some more - receive them and restart scan - * 0 found some more - receive them - * 1..15 none found - * - * The routine returns only when a complete scan has been performed either - * without finding any packets to receive or BGE_MAXPKT_RCVED packets were - * received from ring 0 and other rings (if used) are empty. + * XXX: Poll a particular ring. The implementation is incomplete. + * Once the ring interrupts are disabled, we need to do bge_recyle() + * for the ring as well and re enable the ring interrupt automatically + * if the poll doesn't find any packets in the ring. We need to + * have MSI-X interrupts support for this. * - * Note that driver-defined locks may *NOT* be held across calls - * to gld_recv(). - * - * Note: the expression (BGE_RECV_RINGS_USED > 1), yields a compile-time - * constant and allows the compiler to optimise away the outer do-loop - * if only one receive ring is being used. + * The basic poll policy is that rings that are dealing with explicit + * flows (like TCP or some service) and are marked as such should + * have their own MSI-X interrupt per ring. bge_intr() should leave + * that interrupt disabled after an upcall. The ring is in poll mode. + * When a poll thread comes down and finds nothing, the MSI-X interrupt + * is automatically enabled. Squeue needs to deal with the race of + * a new interrupt firing and reaching before poll thread returns. + */ +mblk_t * +bge_poll_ring(void *arg, int bytes_to_pickup) +{ + recv_ring_t *rrp = arg; + bge_t *bgep = rrp->bgep; + bge_rbd_t *hw_rbd_p; + uint64_t slot; + mblk_t *head; + mblk_t **tail; + mblk_t *mp; + size_t sz = 0; + + mutex_enter(rrp->rx_lock); + + /* + * Sync (all) the receive ring descriptors + * before accepting the packets they describe + */ + DMA_SYNC(rrp->desc, DDI_DMA_SYNC_FORKERNEL); + hw_rbd_p = DMA_VPTR(rrp->desc); + head = NULL; + tail = &head; + slot = rrp->rx_next; + + /* Note: volatile */ + while ((slot != *rrp->prod_index_p) && (sz <= bytes_to_pickup)) { + if ((mp = bge_receive_packet(bgep, &hw_rbd_p[slot])) != NULL) { + *tail = mp; + sz += msgdsize(mp); + tail = &mp->b_next; + } + rrp->rx_next = slot = NEXT(slot, rrp->desc.nslots); + } + + bge_mbx_put(bgep, rrp->chip_mbx_reg, rrp->rx_next); + mutex_exit(rrp->rx_lock); + return (head); +} + +/* + * Receive all packets in all rings. */ void bge_receive(bge_t *bgep, bge_status_t *bsp); #pragma no_inline(bge_receive) @@ -366,41 +391,31 @@ void bge_receive(bge_t *bgep, bge_status_t *bsp) { recv_ring_t *rrp; - uint64_t ring; - uint64_t rx_rings = bgep->chipid.rx_rings; + uint64_t index; mblk_t *mp; -restart: - ring = 0; - rrp = &bgep->recv[ring]; - do { + for (index = 0; index < bgep->chipid.rx_rings; index++) { + /* + * Start from the first ring. 
+ */ + rrp = &bgep->recv[index]; + /* * For each ring, (rrp->prod_index_p) points to the * proper index within the status block (which has * already been sync'd by the caller) */ - ASSERT(rrp->prod_index_p == RECV_INDEX_P(bsp, ring)); + ASSERT(rrp->prod_index_p == RECV_INDEX_P(bsp, index)); - if (*rrp->prod_index_p == rrp->rx_next) + if (*rrp->prod_index_p == rrp->rx_next || rrp->poll_flag) continue; /* no packets */ if (mutex_tryenter(rrp->rx_lock) == 0) continue; /* already in process */ mp = bge_receive_ring(bgep, rrp); mutex_exit(rrp->rx_lock); - if (mp != NULL) { - mac_rx(bgep->mh, rrp->handle, mp); - - /* - * Restart from ring 0, if the driver is compiled - * with multiple rings and we're not on ring 0 now - */ - if (rx_rings > 1 && ring > 0) - goto restart; - } - - /* - * Loop over all rings (if there *are* multiple rings) - */ - } while (++rrp, ++ring < rx_rings); + if (mp != NULL) + mac_rx_ring(bgep->mh, rrp->ring_handle, mp, + rrp->ring_gen_num); + } } diff --git a/usr/src/uts/common/io/bge/bge_send.c b/usr/src/uts/common/io/bge/bge_send.c index a8c6f16ac2..01b70fd13d 100644 --- a/usr/src/uts/common/io/bge/bge_send.c +++ b/usr/src/uts/common/io/bge/bge_send.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "bge_impl.h" @@ -484,11 +482,11 @@ start_tx: mutex_exit(srp->tx_lock); } -static boolean_t -bge_send(bge_t *bgep, mblk_t *mp) +mblk_t * +bge_ring_tx(void *arg, mblk_t *mp) { - uint_t ring = 0; /* use ring 0 */ - send_ring_t *srp; + send_ring_t *srp = arg; + bge_t *bgep = srp->bgep; struct ether_vlan_header *ehp; bge_queue_item_t *txbuf_item; sw_txbuf_t *txbuf; @@ -499,7 +497,6 @@ bge_send(bge_t *bgep, mblk_t *mp) char *pbuf; ASSERT(mp->b_next == NULL); - srp = &bgep->send[ring]; /* * Get a s/w tx buffer first @@ -510,7 +507,7 @@ bge_send(bge_t *bgep, mblk_t *mp) srp->tx_nobuf++; bgep->tx_resched_needed = B_TRUE; bge_send_serial(bgep, srp); - return (B_FALSE); + return (mp); } /* @@ -564,12 +561,23 @@ bge_send(bge_t *bgep, mblk_t *mp) */ bge_send_serial(bgep, srp); + srp->pushed_bytes += MBLKL(mp); + /* * We've copied the contents, the message can be freed right away */ freemsg(mp); + return (NULL); +} + +static mblk_t * +bge_send(bge_t *bgep, mblk_t *mp) +{ + send_ring_t *ring; + + ring = &bgep->send[0]; /* ring 0 */ - return (B_TRUE); + return (bge_ring_tx(ring, mp)); } uint_t @@ -621,7 +629,7 @@ bge_m_tx(void *arg, mblk_t *mp) next = mp->b_next; mp->b_next = NULL; - if (!bge_send(bgep, mp)) { + if ((mp = bge_send(bgep, mp)) != NULL) { mp->b_next = next; break; } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index 615006d86e..55e4d161db 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -31,14 +31,17 @@ #include <sys/mkdev.h> #include <sys/modctl.h> #include <sys/stat.h> -#include <sys/vlan.h> -#include <sys/mac.h> #include <sys/dld_impl.h> #include <sys/dls_impl.h> #include <sys/softmac.h> -#include <sys/vlan.h> -#include <sys/policy.h> +#include <sys/mac.h> +#include <sys/mac_ether.h> +#include <sys/mac_client.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_client_priv.h> #include <inet/common.h> +#include <sys/policy.h> +#include <sys/priv_names.h> static void drv_init(void); static int drv_fini(void); @@ -150,6 +153,7 @@ drv_init(void) { drv_secobj_init(); dld_str_init(); + /* * Create a hash table for autopush configuration. 
*/ @@ -179,7 +183,6 @@ drv_fini(void) rw_enter(&dld_ap_hash_lock, RW_READER); mod_hash_walk(dld_ap_hashp, drv_ap_exist, &exist); rw_exit(&dld_ap_hash_lock); - if (exist) return (EBUSY); @@ -314,24 +317,33 @@ drv_open(dev_t *devp, int flag, int sflag, cred_t *credp) */ /* ARGSUSED */ static int -drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_attr_t *diap = karg; dls_dl_handle_t dlh; - dls_vlan_t *dvp; + dls_link_t *dlp; int err; + mac_perim_handle_t mph; if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0) return (err); - if ((err = dls_vlan_hold(dls_devnet_mac(dlh), - dls_devnet_vid(dlh), &dvp, B_FALSE, B_FALSE)) != 0) { + if ((err = mac_perim_enter_by_macname( + dls_devnet_mac(dlh), &mph)) != 0) { dls_devnet_rele_tmp(dlh); return (err); } - mac_sdu_get(dvp->dv_dlp->dl_mh, NULL, &diap->dia_max_sdu); - dls_vlan_rele(dvp); + if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) { + mac_perim_exit(mph); + dls_devnet_rele_tmp(dlh); + return (err); + } + + mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu); + + dls_link_rele(dlp); + mac_perim_exit(mph); dls_devnet_rele_tmp(dlh); return (0); @@ -342,7 +354,7 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred) */ /* ARGSUSED */ static int -drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_phys_attr_t *dipp = karg; int err; @@ -387,64 +399,184 @@ drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred) return (0); } +/* ARGSUSED */ +static int +drv_ioc_hwgrpget(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_hwgrpget_t *hwgrpp = karg; + dld_hwgrpinfo_t hwgrp, *hip; + mac_handle_t mh = NULL; + int i, err, grpnum; + uint_t bytes_left; + + hwgrpp->dih_n_groups = 0; + err = mac_open_by_linkid(hwgrpp->dih_linkid, &mh); + if (err != 0) + goto done; + + hip = (dld_hwgrpinfo_t *) + ((uchar_t *)arg + sizeof (dld_ioc_hwgrpget_t)); + bytes_left = hwgrpp->dih_size; + grpnum = mac_hwgrp_num(mh); + for (i = 0; i < grpnum; i++) { + if (sizeof (dld_hwgrpinfo_t) > bytes_left) { + err = ENOSPC; + goto done; + } + + bzero(&hwgrp, sizeof (hwgrp)); + bcopy(mac_name(mh), hwgrp.dhi_link_name, + sizeof (hwgrp.dhi_link_name)); + mac_get_hwgrp_info(mh, i, &hwgrp.dhi_grp_num, + &hwgrp.dhi_n_rings, &hwgrp.dhi_grp_type, + &hwgrp.dhi_n_clnts, hwgrp.dhi_clnts); + if (copyout(&hwgrp, hip, sizeof (hwgrp)) != 0) { + err = EFAULT; + goto done; + } + + hip++; + bytes_left -= sizeof (dld_hwgrpinfo_t); + } + +done: + if (mh != NULL) + dld_mac_close(mh); + if (err == 0) + hwgrpp->dih_n_groups = grpnum; + return (err); +} + +/* ARGSUSED */ +static int +drv_ioc_macaddrget(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_macaddrget_t *magp = karg; + dld_macaddrinfo_t mai, *maip; + mac_handle_t mh = NULL; + int i, err; + uint_t bytes_left; + boolean_t is_used; + + magp->dig_count = 0; + err = mac_open_by_linkid(magp->dig_linkid, &mh); + if (err != 0) + goto done; + + maip = (dld_macaddrinfo_t *) + ((uchar_t *)arg + sizeof (dld_ioc_macaddrget_t)); + bytes_left = magp->dig_size; + + for (i = 0; i < mac_addr_factory_num(mh) + 1; i++) { + if (sizeof (dld_macaddrinfo_t) > bytes_left) { + err = ENOSPC; + goto done; + } + + bzero(&mai, sizeof (mai)); + + if (i == 0) { + /* primary MAC address */ + mac_unicast_primary_get(mh, mai.dmi_addr); + mai.dmi_addrlen = mac_addr_len(mh); + 
mac_unicast_primary_info(mh, mai.dmi_client_name, + &is_used); + } else { + /* factory MAC address slot */ + mac_addr_factory_value(mh, i, mai.dmi_addr, + &mai.dmi_addrlen, mai.dmi_client_name, &is_used); + } + + mai.dmi_slot = i; + if (is_used) + mai.dmi_flags |= DLDIOCMACADDR_USED; + + if (copyout(&mai, maip, sizeof (mai)) != 0) { + err = EFAULT; + goto done; + } + + maip++; + bytes_left -= sizeof (dld_macaddrinfo_t); + } + +done: + if (mh != NULL) + dld_mac_close(mh); + if (err == 0) + magp->dig_count = mac_addr_factory_num(mh) + 1; + return (err); +} + /* - * DLDIOC_SETPROP + * DLDIOC_SET/GETPROP */ static int -drv_ioc_prop_common(dld_ioc_macprop_t *dipp, intptr_t arg, boolean_t set, +drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, int mode) { - int err = EINVAL; - size_t dsize; - dld_ioc_macprop_t *kdipp; - dls_dl_handle_t dlh; - dls_vlan_t *dvp; - datalink_id_t linkid; + int err = EINVAL; + dls_dl_handle_t dlh = NULL; + dls_link_t *dlp = NULL; + mac_perim_handle_t mph = NULL; mac_prop_t macprop; - uchar_t *cp; - struct dlautopush *dlap; - dld_ioc_zid_t *dzp; + dld_ioc_macprop_t *kprop; + datalink_id_t linkid; + uint_t dsize; + /* - * We only use pr_valsize from dipp, as the caller only did a + * We only use pr_valsize from prop, as the caller only did a * copyin() for sizeof (dld_ioc_prop_t), which doesn't cover * the property data. We copyin the full dld_ioc_prop_t - * including the data into kdipp down below. + * including the data into kprop down below. */ - dsize = sizeof (dld_ioc_macprop_t) + dipp->pr_valsize - 1; - if (dsize < dipp->pr_valsize) + dsize = sizeof (dld_ioc_macprop_t) + prop->pr_valsize - 1; + if (dsize < prop->pr_valsize) return (EINVAL); /* * The property data is variable size, so we need to allocate * a buffer for kernel use as this data was not part of the - * dipp allocation and copyin() done by the framework. + * prop allocation and copyin() done by the framework. 
+	 * prop allocation and copyin() done by the framework.
*/ - if ((kdipp = kmem_alloc(dsize, KM_NOSLEEP)) == NULL) + if ((kprop = kmem_alloc(dsize, KM_NOSLEEP)) == NULL) return (ENOMEM); - if (ddi_copyin((void *)arg, kdipp, dsize, mode) != 0) { + + if (ddi_copyin((void *)arg, kprop, dsize, mode) != 0) { err = EFAULT; goto done; } - linkid = kdipp->pr_linkid; + linkid = kprop->pr_linkid; + if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0) + goto done; + + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), + &mph)) != 0) { + goto done; + } - switch (dipp->pr_num) { - case MAC_PROP_ZONE: + switch (kprop->pr_num) { + case MAC_PROP_ZONE: { if (set) { - dzp = (dld_ioc_zid_t *)kdipp->pr_val; + dld_ioc_zid_t *dzp = (dld_ioc_zid_t *)kprop->pr_val; + err = dls_devnet_setzid(dzp->diz_link, dzp->diz_zid); goto done; } else { - kdipp->pr_perm_flags = MAC_PROP_PERM_RW; - cp = (uchar_t *)kdipp->pr_val; - err = dls_devnet_getzid(linkid, (zoneid_t *)cp); + kprop->pr_perm_flags = MAC_PROP_PERM_RW; + err = dls_devnet_getzid(linkid, + (zoneid_t *)kprop->pr_val); goto done; } - case MAC_PROP_AUTOPUSH: + } + case MAC_PROP_AUTOPUSH: { + struct dlautopush *dlap = + (struct dlautopush *)kprop->pr_val; + if (set) { - if (dipp->pr_valsize != 0) { - dlap = (struct dlautopush *)kdipp->pr_val; + if (kprop->pr_valsize != 0) { err = drv_ioc_setap(linkid, dlap); goto done; } else { @@ -452,125 +584,73 @@ drv_ioc_prop_common(dld_ioc_macprop_t *dipp, intptr_t arg, boolean_t set, goto done; } } else { - kdipp->pr_perm_flags = MAC_PROP_PERM_RW; - dlap = (struct dlautopush *)kdipp->pr_val; + kprop->pr_perm_flags = MAC_PROP_PERM_RW; err = drv_ioc_getap(linkid, dlap); goto done; } - + } default: break; } - if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0) - goto done; - - if ((err = dls_vlan_hold(dls_devnet_mac(dlh), - dls_devnet_vid(dlh), &dvp, B_FALSE, B_FALSE)) != 0) { - dls_devnet_rele_tmp(dlh); + if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; - } - macprop.mp_name = kdipp->pr_name; - macprop.mp_id = kdipp->pr_num; - macprop.mp_flags = kdipp->pr_flags; + macprop.mp_name = kprop->pr_name; + macprop.mp_id = kprop->pr_num; + macprop.mp_flags = kprop->pr_flags; if (set) { - err = mac_set_prop(dvp->dv_dlp->dl_mh, &macprop, - kdipp->pr_val, kdipp->pr_valsize); + err = mac_set_prop(dlp->dl_mh, &macprop, kprop->pr_val, + kprop->pr_valsize); } else { - kdipp->pr_perm_flags = MAC_PROP_PERM_RW; - err = mac_get_prop(dvp->dv_dlp->dl_mh, &macprop, - kdipp->pr_val, kdipp->pr_valsize, &kdipp->pr_perm_flags); + kprop->pr_perm_flags = MAC_PROP_PERM_RW; + err = mac_get_prop(dlp->dl_mh, &macprop, kprop->pr_val, + kprop->pr_valsize, &kprop->pr_perm_flags); } - dls_vlan_rele(dvp); - dls_devnet_rele_tmp(dlh); done: if (!set && err == 0 && - ddi_copyout(kdipp, (void *)arg, dsize, mode) != 0) + ddi_copyout(kprop, (void *)arg, dsize, mode) != 0) err = EFAULT; - kmem_free(kdipp, dsize); - return (err); -} -/* ARGSUSED */ -static int -drv_ioc_setprop(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - return (drv_ioc_prop_common(karg, arg, B_TRUE, mode)); -} + if (dlp != NULL) + dls_link_rele(dlp); -/* ARGSUSED */ -static int -drv_ioc_getprop(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - return (drv_ioc_prop_common(karg, arg, B_FALSE, mode)); -} + if (mph != NULL) { + int32_t cpuid; + void *mdip = NULL; -/* - * DLDIOC_CREATE_VLAN - */ -/* ARGSUSED */ -static int -drv_ioc_create_vlan(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - dld_ioc_create_vlan_t *dicp = karg; + if (dlp != NULL && set && err == 0) { + cpuid = mac_client_intr_cpu(dlp->dl_mch); + mdip 
= mac_get_devinfo(dlp->dl_mh); + } - return (dls_devnet_create_vlan(dicp->dic_vlanid, dicp->dic_linkid, - dicp->dic_vid, dicp->dic_force)); + mac_perim_exit(mph); + + if (mdip != NULL) + mac_client_set_intr_cpu(mdip, dlp->dl_mch, cpuid); + } + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); + + if (kprop != NULL) + kmem_free(kprop, dsize); + return (err); } -/* - * DLDIOC_DELETE_VLAN - */ /* ARGSUSED */ static int -drv_ioc_delete_vlan(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - dld_ioc_delete_vlan_t *didp = karg; - - return (dls_devnet_destroy_vlan(didp->did_linkid)); + return (drv_ioc_prop_common(karg, arg, B_TRUE, mode)); } -/* - * DLDIOC_VLAN_ATTR - */ /* ARGSUSED */ static int -drv_ioc_vlan_attr(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - dld_ioc_vlan_attr_t *divp = karg; - dls_dl_handle_t dlh; - uint16_t vid; - dls_vlan_t *dvp; - int err; - - /* - * Hold this link to prevent it from being deleted. - */ - if ((err = dls_devnet_hold_tmp(divp->div_vlanid, &dlh)) != 0) - return (err); - - if ((vid = dls_devnet_vid(dlh)) == VLAN_ID_NONE) { - dls_devnet_rele_tmp(dlh); - return (EINVAL); - } - - err = dls_vlan_hold(dls_devnet_mac(dlh), vid, &dvp, B_FALSE, B_FALSE); - if (err != 0) { - dls_devnet_rele_tmp(dlh); - return (err); - } - - divp->div_linkid = dls_devnet_linkid(dlh); - divp->div_implicit = !dls_devnet_is_explicit(dlh); - divp->div_vid = vid; - divp->div_force = dvp->dv_force; - - dls_vlan_rele(dvp); - dls_devnet_rele_tmp(dlh); - return (0); + return (drv_ioc_prop_common(karg, arg, B_FALSE, mode)); } /* @@ -581,7 +661,7 @@ drv_ioc_vlan_attr(void *karg, intptr_t arg, int mode, cred_t *cred) */ /* ARGSUSED */ static int -drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_rename_t *dir = karg; mod_hash_key_t key; @@ -719,7 +799,7 @@ drv_ioc_clrap(datalink_id_t linkid) */ /* ARGSUSED */ static int -drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_door_t *did = karg; @@ -727,6 +807,76 @@ drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred) } /* + * DLDIOC_USAGELOG + */ +/* ARGSUSED */ +static int +drv_ioc_usagelog(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + dld_ioc_usagelog_t *log_info = (dld_ioc_usagelog_t *)karg; + + if (log_info->ul_type < MAC_LOGTYPE_LINK || + log_info->ul_type > MAC_LOGTYPE_FLOW) + return (EINVAL); + + if (log_info->ul_onoff) + mac_start_logusage(log_info->ul_type, log_info->ul_interval); + else + mac_stop_logusage(log_info->ul_type); + return (0); +} + +/* + * Process a DLDIOC_ADDFLOW request. + */ +/* ARGSUSED */ +static int +drv_ioc_addflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_addflow_t *afp = karg; + + return (dld_add_flow(afp->af_linkid, afp->af_name, + &afp->af_flow_desc, &afp->af_resource_props)); +} + +/* + * Process a DLDIOC_REMOVEFLOW request. + */ +/* ARGSUSED */ +static int +drv_ioc_removeflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_removeflow_t *rfp = karg; + + return (dld_remove_flow(rfp->rf_name)); +} + +/* + * Process a DLDIOC_MODIFYFLOW request. 
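+ * Only the resource properties (mac_resource_props_t) of the named
+ * flow are updated; its flow descriptor is left untouched.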
+ */ +/* ARGSUSED */ +static int +drv_ioc_modifyflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_modifyflow_t *mfp = karg; + + return (dld_modify_flow(mfp->mf_name, &mfp->mf_resource_props)); +} + +/* + * Process a DLDIOC_WALKFLOW request. + */ +/* ARGSUSED */ +static int +drv_ioc_walkflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_walkflow_t *wfp = karg; + + return (dld_walk_flow(wfp, arg)); +} + +/* * Check for GLDv3 autopush information. There are three cases: * * 1. If devp points to a GLDv3 datalink and it has autopush configuration, @@ -809,7 +959,7 @@ drv_secobj_fini(void) /* ARGSUSED */ static int -drv_ioc_secobj_set(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_secobj_set(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_secobj_set_t *ssp = karg; dld_secobj_t *sobjp, *objp; @@ -885,14 +1035,13 @@ drv_secobj_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) /* ARGSUSED */ static int -drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_secobj_get_t *sgp = karg; dld_secobj_t *sobjp, *objp; int err; sobjp = &sgp->sg_obj; - if (sobjp->so_name[DLD_SECOBJ_NAME_MAX - 1] != '\0') return (EINVAL); @@ -932,7 +1081,8 @@ drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred) /* ARGSUSED */ static int -drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) { dld_ioc_secobj_unset_t *sup = karg; dld_secobj_t *objp; @@ -959,32 +1109,56 @@ drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred) return (0); } +static int +drv_check_policy(dld_ioc_info_t *info, cred_t *cred) +{ + int i, err = 0; + + for (i = 0; info->di_priv[i] != NULL && i < DLD_MAX_PRIV; i++) { + if ((err = secpolicy_dld_ioctl(cred, info->di_priv[i], + "dld ioctl")) != 0) { + break; + } + } + if (err == 0) + return (0); + + return (secpolicy_net_config(cred, B_FALSE)); +} + static dld_ioc_info_t drv_ioc_list[] = { {DLDIOC_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_attr_t), - drv_ioc_attr}, + drv_ioc_attr, {NULL}}, {DLDIOC_PHYS_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_phys_attr_t), - drv_ioc_phys_attr}, - {DLDIOC_SECOBJ_SET, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_secobj_set_t), drv_ioc_secobj_set}, - {DLDIOC_SECOBJ_GET, DLDCOPYINOUT | DLDDLCONFIG, - sizeof (dld_ioc_secobj_get_t), drv_ioc_secobj_get}, - {DLDIOC_SECOBJ_UNSET, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_secobj_unset_t), drv_ioc_secobj_unset}, - {DLDIOC_CREATE_VLAN, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_create_vlan_t), drv_ioc_create_vlan}, - {DLDIOC_DELETE_VLAN, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_delete_vlan_t), - drv_ioc_delete_vlan}, - {DLDIOC_VLAN_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_vlan_attr_t), - drv_ioc_vlan_attr}, - {DLDIOC_DOORSERVER, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_door_t), - drv_ioc_doorserver}, - {DLDIOC_RENAME, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_rename_t), - drv_ioc_rename}, + drv_ioc_phys_attr, {NULL}}, + {DLDIOC_SECOBJ_SET, DLDCOPYIN, sizeof (dld_ioc_secobj_set_t), + drv_ioc_secobj_set, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_SECOBJ_GET, DLDCOPYINOUT, sizeof (dld_ioc_secobj_get_t), + drv_ioc_secobj_get, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_SECOBJ_UNSET, DLDCOPYIN, sizeof (dld_ioc_secobj_unset_t), + drv_ioc_secobj_unset, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_DOORSERVER, DLDCOPYIN, sizeof (dld_ioc_door_t), + 
drv_ioc_doorserver, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_RENAME, DLDCOPYIN, sizeof (dld_ioc_rename_t), + drv_ioc_rename, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_MACADDRGET, DLDCOPYINOUT, sizeof (dld_ioc_macaddrget_t), + drv_ioc_macaddrget, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_ADDFLOW, DLDCOPYIN, sizeof (dld_ioc_addflow_t), + drv_ioc_addflow, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_REMOVEFLOW, DLDCOPYIN, sizeof (dld_ioc_removeflow_t), + drv_ioc_removeflow, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_MODIFYFLOW, DLDCOPYIN, sizeof (dld_ioc_modifyflow_t), + drv_ioc_modifyflow, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_WALKFLOW, DLDCOPYINOUT, sizeof (dld_ioc_walkflow_t), + drv_ioc_walkflow, {NULL}}, + {DLDIOC_USAGELOG, DLDCOPYIN, sizeof (dld_ioc_usagelog_t), + drv_ioc_usagelog, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_SETMACPROP, DLDCOPYIN, sizeof (dld_ioc_macprop_t), + drv_ioc_setprop, {PRIV_SYS_DL_CONFIG}}, {DLDIOC_GETMACPROP, DLDCOPYIN, sizeof (dld_ioc_macprop_t), - drv_ioc_getprop}, - {DLDIOC_SETMACPROP, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_macprop_t), - drv_ioc_setprop} + drv_ioc_getprop, {NULL}}, + {DLDIOC_GETHWGRP, DLDCOPYINOUT, sizeof (dld_ioc_hwgrpget_t), + drv_ioc_hwgrpget, {PRIV_SYS_DL_CONFIG}}, }; typedef struct dld_ioc_modentry { @@ -1090,11 +1264,8 @@ drv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp) } info = &dim->dim_list[i]; - - if ((info->di_flags & DLDDLCONFIG) && secpolicy_dl_config(cred) != 0) { - err = EPERM; + if ((err = drv_check_policy(info, cred)) != 0) goto done; - } sz = info->di_argsize; if ((buf = kmem_zalloc(sz, KM_NOSLEEP)) == NULL) { @@ -1108,7 +1279,7 @@ drv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp) goto done; } - err = info->di_func(buf, arg, mode, cred); + err = info->di_func(buf, arg, mode, cred, rvalp); if ((info->di_flags & DLDCOPYOUT) && ddi_copyout(buf, (void *)arg, sz, mode) != 0 && err == 0) diff --git a/usr/src/uts/common/io/dld/dld_flow.c b/usr/src/uts/common/io/dld/dld_flow.c new file mode 100644 index 0000000000..b57368484f --- /dev/null +++ b/usr/src/uts/common/io/dld/dld_flow.c @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Flows ioctls implementation. + */ + +#include <sys/dld.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> + +/* + * Implements flow add, remove, modify ioctls. 
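+ * Each is a thin wrapper around the corresponding mac_link_flow_*()
+ * entry point.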
+ */
+int
+dld_add_flow(datalink_id_t linkid, char *flow_name, flow_desc_t *flow_desc,
+    mac_resource_props_t *mrp)
+{
+	return (mac_link_flow_add(linkid, flow_name, flow_desc, mrp));
+}
+
+int
+dld_remove_flow(char *flow_name)
+{
+	return (mac_link_flow_remove(flow_name));
+}
+
+int
+dld_modify_flow(char *flow_name, mac_resource_props_t *mrp)
+{
+	return (mac_link_flow_modify(flow_name, mrp));
+}
+
+
+/*
+ * Callback function and structure used by dld_walk_flow().
+ */
+typedef struct flowinfo_state_s {
+	int		fi_bufsize;
+	int		fi_nflows;
+	uchar_t		*fi_fl;
+} flowinfo_state_t;
+
+static int
+dld_walk_flow_cb(mac_flowinfo_t *finfo, void *arg)
+{
+	flowinfo_state_t	*statep = arg;
+	dld_flowinfo_t		fi;
+
+	if (statep->fi_bufsize < sizeof (dld_flowinfo_t))
+		return (ENOSPC);
+
+	(void) strlcpy(fi.fi_flowname, finfo->fi_flow_name,
+	    sizeof (fi.fi_flowname));
+	fi.fi_linkid = finfo->fi_link_id;
+	fi.fi_flow_desc = finfo->fi_flow_desc;
+	fi.fi_resource_props = finfo->fi_resource_props;
+
+	if (copyout(&fi, statep->fi_fl, sizeof (fi)) != 0) {
+		return (EFAULT);
+	}
+	statep->fi_nflows++;
+	statep->fi_bufsize -= sizeof (dld_flowinfo_t);
+	statep->fi_fl += sizeof (dld_flowinfo_t);
+	return (0);
+}
+
+/*
+ * Implements the flow walk ioctl.
+ * Retrieves a specific flow or a list of flows from the specified link.
+ * ENOSPC is returned if a bigger buffer is needed.
+ */
+int
+dld_walk_flow(dld_ioc_walkflow_t *wf, intptr_t uaddr)
+{
+	flowinfo_state_t	state;
+	mac_flowinfo_t		finfo;
+	int			err = 0;
+
+	state.fi_bufsize = wf->wf_len;
+	state.fi_fl = (uchar_t *)uaddr + sizeof (*wf);
+	state.fi_nflows = 0;
+
+	if (wf->wf_name[0] == '\0') {
+		err = mac_link_flow_walk(wf->wf_linkid, dld_walk_flow_cb,
+		    &state);
+	} else {
+		err = mac_link_flow_info(wf->wf_name, &finfo);
+		if (err != 0)
+			return (err);
+
+		err = dld_walk_flow_cb(&finfo, &state);
+	}
+	wf->wf_nflows = state.fi_nflows;
+	return (err);
+}
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 5bc1fc5322..2c3d0f7ecb 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -23,32 +23,19 @@
  * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Driver */ - -#include <sys/types.h> -#include <sys/debug.h> #include <sys/sysmacros.h> -#include <sys/stream.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/strsun.h> -#include <sys/cpuvar.h> -#include <sys/dlpi.h> -#include <netinet/in.h> -#include <sys/sdt.h> #include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/vlan.h> -#include <sys/mac.h> -#include <sys/dls.h> -#include <sys/dld.h> #include <sys/dld_impl.h> -#include <sys/dls_soft_ring.h> +#include <sys/mac_client.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_client_priv.h> -typedef boolean_t proto_reqfunc_t(dld_str_t *, union DL_primitives *, mblk_t *); +typedef void proto_reqfunc_t(dld_str_t *, mblk_t *); static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, @@ -56,13 +43,8 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, proto_notify_req, proto_passive_req; -static void proto_poll_disable(dld_str_t *); -static boolean_t proto_poll_enable(dld_str_t *, dl_capab_dls_t *); - -static void proto_soft_ring_disable(dld_str_t *); -static boolean_t proto_soft_ring_enable(dld_str_t *, dl_capab_dls_t *); -static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *); -static void proto_change_soft_ring_fanout(dld_str_t *, int); +static void proto_capability_advertise(dld_str_t *, mblk_t *); +static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); #define DL_ACK_PENDING(state) \ ((state) == DL_ATTACH_PENDING || \ @@ -79,70 +61,72 @@ static void proto_change_soft_ring_fanout(dld_str_t *, int); * by the above primitives. 
*/ void -dld_wput_proto_nondata(dld_str_t *dsp, mblk_t *mp) +dld_proto(dld_str_t *dsp, mblk_t *mp) { - union DL_primitives *udlp; t_uscalar_t prim; - ASSERT(MBLKL(mp) >= sizeof (t_uscalar_t)); - - udlp = (union DL_primitives *)mp->b_rptr; - prim = udlp->dl_primitive; + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + freemsg(mp); + return; + } + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; switch (prim) { case DL_INFO_REQ: - (void) proto_info_req(dsp, udlp, mp); + proto_info_req(dsp, mp); break; case DL_BIND_REQ: - (void) proto_bind_req(dsp, udlp, mp); + proto_bind_req(dsp, mp); break; case DL_UNBIND_REQ: - (void) proto_unbind_req(dsp, udlp, mp); + proto_unbind_req(dsp, mp); + break; + case DL_UNITDATA_REQ: + proto_unitdata_req(dsp, mp); break; case DL_UDQOS_REQ: - (void) proto_udqos_req(dsp, udlp, mp); + proto_udqos_req(dsp, mp); break; case DL_ATTACH_REQ: - (void) proto_attach_req(dsp, udlp, mp); + proto_attach_req(dsp, mp); break; case DL_DETACH_REQ: - (void) proto_detach_req(dsp, udlp, mp); + proto_detach_req(dsp, mp); break; case DL_ENABMULTI_REQ: - (void) proto_enabmulti_req(dsp, udlp, mp); + proto_enabmulti_req(dsp, mp); break; case DL_DISABMULTI_REQ: - (void) proto_disabmulti_req(dsp, udlp, mp); + proto_disabmulti_req(dsp, mp); break; case DL_PROMISCON_REQ: - (void) proto_promiscon_req(dsp, udlp, mp); + proto_promiscon_req(dsp, mp); break; case DL_PROMISCOFF_REQ: - (void) proto_promiscoff_req(dsp, udlp, mp); + proto_promiscoff_req(dsp, mp); break; case DL_PHYS_ADDR_REQ: - (void) proto_physaddr_req(dsp, udlp, mp); + proto_physaddr_req(dsp, mp); break; case DL_SET_PHYS_ADDR_REQ: - (void) proto_setphysaddr_req(dsp, udlp, mp); + proto_setphysaddr_req(dsp, mp); break; case DL_NOTIFY_REQ: - (void) proto_notify_req(dsp, udlp, mp); + proto_notify_req(dsp, mp); break; case DL_CAPABILITY_REQ: - (void) proto_capability_req(dsp, udlp, mp); + proto_capability_req(dsp, mp); break; case DL_PASSIVE_REQ: - (void) proto_passive_req(dsp, udlp, mp); + proto_passive_req(dsp, mp); break; default: - (void) proto_req(dsp, udlp, mp); + proto_req(dsp, mp); break; } } #define NEG(x) -(x) - typedef struct dl_info_ack_wrapper { dl_info_ack_t dl_info; uint8_t dl_addr[MAXMACADDRLEN + sizeof (uint16_t)]; @@ -154,9 +138,8 @@ typedef struct dl_info_ack_wrapper { /* * DL_INFO_REQ */ -/*ARGSUSED*/ -static boolean_t -proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_info_req(dld_str_t *dsp, mblk_t *mp) { dl_info_ack_wrapper_t *dlwp; dl_info_ack_t *dlp; @@ -176,9 +159,7 @@ proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) */ if ((mp = mexchange(q, mp, sizeof (dl_info_ack_wrapper_t), M_PCPROTO, 0)) == NULL) - return (B_FALSE); - - rw_enter(&dsp->ds_lock, RW_READER); + return; bzero(mp->b_rptr, sizeof (dl_info_ack_wrapper_t)); dlwp = (dl_info_ack_wrapper_t *)mp->b_rptr; @@ -307,7 +288,8 @@ proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) */ dlp->dl_addr_offset = (uintptr_t)addr - (uintptr_t)dlp; if (addr_length > 0) - bcopy(dsp->ds_curr_addr, addr, addr_length); + mac_unicast_primary_get(dsp->ds_mh, addr); + *(uint16_t *)(addr + addr_length) = dsp->ds_sap; } @@ -319,25 +301,20 @@ done: ASSERT(IMPLY(dlp->dl_brdcst_addr_offset != 0, dlp->dl_brdcst_addr_length != 0)); - rw_exit(&dsp->ds_lock); - qreply(q, mp); - return (B_TRUE); } /* * DL_ATTACH_REQ */ -static boolean_t -proto_attach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_attach_req(dld_str_t *dsp, mblk_t *mp) { - dl_attach_req_t *dlp = 
(dl_attach_req_t *)udlp; + dl_attach_req_t *dlp = (dl_attach_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; - rw_enter(&dsp->ds_lock, RW_WRITER); - if (MBLKL(mp) < sizeof (dl_attach_req_t) || dlp->dl_ppa < 0 || dsp->ds_style == DL_STYLE1) { dl_err = DL_BADPRIM; @@ -366,25 +343,22 @@ proto_attach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } ASSERT(dsp->ds_dlstate == DL_UNBOUND); - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_ATTACH_REQ); - return (B_TRUE); + return; + failed: - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_ATTACH_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } -/*ARGSUSED*/ -static boolean_t -proto_detach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +/* + * DL_DETACH_REQ + */ +static void +proto_detach_req(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; t_uscalar_t dl_err; - rw_enter(&dsp->ds_lock, RW_WRITER); - if (MBLKL(mp) < sizeof (dl_detach_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -400,37 +374,34 @@ proto_detach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + ASSERT(dsp->ds_datathr_cnt == 0); dsp->ds_dlstate = DL_DETACH_PENDING; - dld_str_detach(dsp); - rw_exit(&dsp->ds_lock); + dld_str_detach(dsp); dlokack(dsp->ds_wq, mp, DL_DETACH_REQ); - return (B_TRUE); + return; + failed: - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_DETACH_REQ, dl_err, 0); - return (B_FALSE); } /* * DL_BIND_REQ */ -static boolean_t -proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_bind_req(dld_str_t *dsp, mblk_t *mp) { - dl_bind_req_t *dlp = (dl_bind_req_t *)udlp; + dl_bind_req_t *dlp = (dl_bind_req_t *)mp->b_rptr; int err = 0; uint8_t dlsap_addr[MAXMACADDRLEN + sizeof (uint16_t)]; uint_t dlsap_addr_length; t_uscalar_t dl_err; t_scalar_t sap; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; + void *mdip; + int32_t intr_cpu; - /* - * Because control message processing is serialized, we don't need - * to hold any locks to read any fields of dsp; we only need ds_lock - * to update the ds_dlstate, ds_sap and ds_passivestate fields. - */ if (MBLKL(mp) < sizeof (dl_bind_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -451,24 +422,26 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } + dsp->ds_dlstate = DL_BIND_PENDING; /* * Set the receive callback. */ - dls_rx_set(dsp->ds_dc, (dsp->ds_mode == DLD_RAW) ? + dls_rx_set(dsp, (dsp->ds_mode == DLD_RAW) ? dld_str_rx_raw : dld_str_rx_unitdata, dsp); /* * Bind the channel such that it can receive packets. */ sap = dlp->dl_sap; - err = dls_bind(dsp->ds_dc, sap); + err = dls_bind(dsp, sap); if (err != 0) { switch (err) { case EINVAL: @@ -480,17 +453,28 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) break; } + dsp->ds_dlstate = DL_UNBOUND; if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); - - goto failed; + dls_active_clear(dsp); + goto failed2; } + intr_cpu = mac_client_intr_cpu(dsp->ds_mch); + mdip = mac_get_devinfo(dsp->ds_mh); + mac_perim_exit(mph); + + /* + * We do this after we get out of the perim to avoid deadlocks + * etc. since part of mac_client_retarget_intr is to walk the + * device tree in order to find and retarget the interrupts. 
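+	 * Hence mac_client_set_intr_cpu() is called below only after
+	 * mac_perim_exit().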
+ */ + mac_client_set_intr_cpu(mdip, dsp->ds_mch, intr_cpu); + /* * Copy in MAC address. */ dlsap_addr_length = dsp->ds_mip->mi_addr_length; - bcopy(dsp->ds_curr_addr, dlsap_addr, dlsap_addr_length); + mac_unicast_primary_get(dsp->ds_mh, dlsap_addr); /* * Copy in the SAP. @@ -498,37 +482,28 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) *(uint16_t *)(dlsap_addr + dlsap_addr_length) = sap; dlsap_addr_length += sizeof (uint16_t); - rw_enter(&dsp->ds_lock, RW_WRITER); - dsp->ds_dlstate = DL_IDLE; if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - dsp->ds_sap = sap; - - if (dsp->ds_mode == DLD_FASTPATH) - dsp->ds_tx = str_mdata_fastpath_put; - else if (dsp->ds_mode == DLD_RAW) - dsp->ds_tx = str_mdata_raw_put; - dsp->ds_unitdata_tx = dld_wput_proto_data; - - rw_exit(&dsp->ds_lock); dlbindack(q, mp, sap, dlsap_addr, dlsap_addr_length, 0, 0); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_BIND_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_UNBIND_REQ */ -/*ARGSUSED*/ -static boolean_t -proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_unbind_req(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; t_uscalar_t dl_err; + mac_perim_handle_t mph; if (MBLKL(mp) < sizeof (dl_unbind_req_t)) { dl_err = DL_BADPRIM; @@ -540,32 +515,27 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } - /* - * Flush any remaining packets scheduled for transmission. - */ - dld_tx_flush(dsp); + mutex_enter(&dsp->ds_lock); + while (dsp->ds_datathr_cnt != 0) + cv_wait(&dsp->ds_datathr_cv, &dsp->ds_lock); - /* - * Unbind the channel to stop packets being received. - */ - dls_unbind(dsp->ds_dc); + dsp->ds_dlstate = DL_UNBIND_PENDING; + mutex_exit(&dsp->ds_lock); + mac_perim_enter_by_mh(dsp->ds_mh, &mph); /* - * Clear the receive callback. + * Unbind the channel to stop packets being received. */ - dls_rx_set(dsp->ds_dc, NULL, NULL); - - rw_enter(&dsp->ds_lock, RW_WRITER); + if (dls_unbind(dsp) != 0) { + dl_err = DL_OUTSTATE; + mac_perim_exit(mph); + goto failed; + } /* * Disable polling mode, if it is enabled. */ - proto_poll_disable(dsp); - - /* - * If soft rings were enabled, the workers should be quiesced. - */ - dls_soft_ring_disable(dsp->ds_dc); + (void) dld_capab_poll_disable(dsp, NULL); /* * Clear LSO flags. @@ -574,38 +544,37 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) dsp->ds_lso_max = 0; /* + * Clear the receive callback. + */ + dls_rx_set(dsp, NULL, NULL); + dsp->ds_direct = B_FALSE; + + /* * Set the mode back to the default (unitdata). 
*/ dsp->ds_mode = DLD_UNITDATA; dsp->ds_dlstate = DL_UNBOUND; - DLD_TX_QUIESCE(dsp); - rw_exit(&dsp->ds_lock); - - dlokack(q, mp, DL_UNBIND_REQ); - return (B_TRUE); + mac_perim_exit(mph); + dlokack(dsp->ds_wq, mp, DL_UNBIND_REQ); + return; failed: dlerrorack(q, mp, DL_UNBIND_REQ, dl_err, 0); - return (B_FALSE); } /* * DL_PROMISCON_REQ */ -static boolean_t -proto_promiscon_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) { - dl_promiscon_req_t *dlp = (dl_promiscon_req_t *)udlp; + dl_promiscon_req_t *dlp = (dl_promiscon_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; - uint32_t promisc; + uint32_t promisc_saved; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control message processing is serialized, we don't need - * to hold any locks to read any fields of dsp; we only need ds_lock - * to update the ds_promisc and ds_passivestate fields. - */ if (MBLKL(mp) < sizeof (dl_promiscon_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -617,70 +586,73 @@ proto_promiscon_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + promisc_saved = dsp->ds_promisc; switch (dlp->dl_level) { case DL_PROMISC_SAP: - promisc = DLS_PROMISC_SAP; + dsp->ds_promisc |= DLS_PROMISC_SAP; break; + case DL_PROMISC_MULTI: - promisc = DLS_PROMISC_MULTI; + dsp->ds_promisc |= DLS_PROMISC_MULTI; break; + case DL_PROMISC_PHYS: - promisc = DLS_PROMISC_PHYS; + dsp->ds_promisc |= DLS_PROMISC_PHYS; break; + default: dl_err = DL_NOTSUPPORTED; goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { + dsp->ds_promisc = promisc_saved; dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } /* * Adjust channel promiscuity. */ - promisc = (dsp->ds_promisc | promisc); - err = dls_promisc(dsp->ds_dc, promisc); + err = dls_promisc(dsp, promisc_saved); + if (err != 0) { dl_err = DL_SYSERR; + dsp->ds_promisc = promisc_saved; if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); - goto failed; + dls_active_clear(dsp); + goto failed2; } - rw_enter(&dsp->ds_lock, RW_WRITER); + mac_perim_exit(mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - dsp->ds_promisc = promisc; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_PROMISCON_REQ); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_PROMISCON_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_PROMISCOFF_REQ */ -static boolean_t -proto_promiscoff_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) { - dl_promiscoff_req_t *dlp = (dl_promiscoff_req_t *)udlp; + dl_promiscoff_req_t *dlp = (dl_promiscoff_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; - uint32_t promisc; + uint32_t promisc_saved; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control messages processing is serialized, we don't need - * to hold any lock to read any field of dsp; we hold ds_lock to - * update the ds_promisc field. 
- */ if (MBLKL(mp) < sizeof (dl_promiscoff_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -692,60 +664,66 @@ proto_promiscoff_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + promisc_saved = dsp->ds_promisc; switch (dlp->dl_level) { case DL_PROMISC_SAP: - promisc = DLS_PROMISC_SAP; + if (!(dsp->ds_promisc & DLS_PROMISC_SAP)) { + dl_err = DL_NOTENAB; + goto failed; + } + dsp->ds_promisc &= ~DLS_PROMISC_SAP; break; + case DL_PROMISC_MULTI: - promisc = DLS_PROMISC_MULTI; + if (!(dsp->ds_promisc & DLS_PROMISC_MULTI)) { + dl_err = DL_NOTENAB; + goto failed; + } + dsp->ds_promisc &= ~DLS_PROMISC_MULTI; break; + case DL_PROMISC_PHYS: - promisc = DLS_PROMISC_PHYS; + if (!(dsp->ds_promisc & DLS_PROMISC_PHYS)) { + dl_err = DL_NOTENAB; + goto failed; + } + dsp->ds_promisc &= ~DLS_PROMISC_PHYS; break; + default: dl_err = DL_NOTSUPPORTED; goto failed; } - if (!(dsp->ds_promisc & promisc)) { - dl_err = DL_NOTENAB; - goto failed; - } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + /* + * Adjust channel promiscuity. + */ + err = dls_promisc(dsp, promisc_saved); + mac_perim_exit(mph); - promisc = (dsp->ds_promisc & ~promisc); - err = dls_promisc(dsp->ds_dc, promisc); if (err != 0) { dl_err = DL_SYSERR; goto failed; } - - rw_enter(&dsp->ds_lock, RW_WRITER); - dsp->ds_promisc = promisc; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_PROMISCOFF_REQ); - return (B_TRUE); + return; failed: dlerrorack(q, mp, DL_PROMISCOFF_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_ENABMULTI_REQ */ -static boolean_t -proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_enabmulti_req(dld_str_t *dsp, mblk_t *mp) { - dl_enabmulti_req_t *dlp = (dl_enabmulti_req_t *)udlp; + dl_enabmulti_req_t *dlp = (dl_enabmulti_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control messages processing is serialized, we don't need - * to hold any lock to read any field of dsp; we hold ds_lock to - * update the ds_passivestate field. 
- */ if (dsp->ds_dlstate == DL_UNATTACHED || DL_ACK_PENDING(dsp->ds_dlstate)) { dl_err = DL_OUTSTATE; @@ -759,14 +737,16 @@ proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } - err = dls_multicst_add(dsp->ds_dc, mp->b_rptr + dlp->dl_addr_offset); + err = dls_multicst_add(dsp, mp->b_rptr + dlp->dl_addr_offset); + if (err != 0) { switch (err) { case EINVAL: @@ -781,40 +761,37 @@ proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) dl_err = DL_SYSERR; break; } - if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); + dls_active_clear(dsp); - goto failed; + goto failed2; } - rw_enter(&dsp->ds_lock, RW_WRITER); + mac_perim_exit(mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_ENABMULTI_REQ); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_ENABMULTI_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_DISABMULTI_REQ */ -static boolean_t -proto_disabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_disabmulti_req(dld_str_t *dsp, mblk_t *mp) { - dl_disabmulti_req_t *dlp = (dl_disabmulti_req_t *)udlp; + dl_disabmulti_req_t *dlp = (dl_disabmulti_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control messages processing is serialized, we don't need - * to hold any lock to read any field of dsp. - */ if (dsp->ds_dlstate == DL_UNATTACHED || DL_ACK_PENDING(dsp->ds_dlstate)) { dl_err = DL_OUTSTATE; @@ -828,45 +805,46 @@ proto_disabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } - err = dls_multicst_remove(dsp->ds_dc, mp->b_rptr + dlp->dl_addr_offset); + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + err = dls_multicst_remove(dsp, mp->b_rptr + dlp->dl_addr_offset); + mac_perim_exit(mph); + if (err != 0) { - switch (err) { + switch (err) { case EINVAL: dl_err = DL_BADADDR; err = 0; break; + case ENOENT: dl_err = DL_NOTENAB; err = 0; break; + default: dl_err = DL_SYSERR; break; } goto failed; } - dlokack(q, mp, DL_DISABMULTI_REQ); - return (B_TRUE); + return; failed: dlerrorack(q, mp, DL_DISABMULTI_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_PHYS_ADDR_REQ */ -static boolean_t -proto_physaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_physaddr_req(dld_str_t *dsp, mblk_t *mp) { - dl_phys_addr_req_t *dlp = (dl_phys_addr_req_t *)udlp; + dl_phys_addr_req_t *dlp = (dl_phys_addr_req_t *)mp->b_rptr; queue_t *q = dsp->ds_wq; t_uscalar_t dl_err; char *addr; uint_t addr_length; - rw_enter(&dsp->ds_lock, RW_READER); - if (MBLKL(mp) < sizeof (dl_phys_addr_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -886,50 +864,34 @@ proto_physaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) addr_length = dsp->ds_mip->mi_addr_length; if (addr_length > 0) { - addr = kmem_alloc(addr_length, KM_NOSLEEP); - if (addr == NULL) { - rw_exit(&dsp->ds_lock); - merror(q, mp, ENOSR); - return (B_FALSE); - } - - /* - * Copy out the address before we drop the lock; we don't - * want to call dlphysaddrack() while holding ds_lock. - */ - bcopy((dlp->dl_addr_type == DL_CURR_PHYS_ADDR) ? 
- dsp->ds_curr_addr : dsp->ds_fact_addr, addr, addr_length); + addr = kmem_alloc(addr_length, KM_SLEEP); + if (dlp->dl_addr_type == DL_CURR_PHYS_ADDR) + mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)addr); + else + bcopy(dsp->ds_mip->mi_unicst_addr, addr, addr_length); - rw_exit(&dsp->ds_lock); dlphysaddrack(q, mp, addr, (t_uscalar_t)addr_length); kmem_free(addr, addr_length); } else { - rw_exit(&dsp->ds_lock); dlphysaddrack(q, mp, NULL, 0); } - return (B_TRUE); + return; failed: - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_PHYS_ADDR_REQ, dl_err, 0); - return (B_FALSE); } /* * DL_SET_PHYS_ADDR_REQ */ -static boolean_t -proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_setphysaddr_req(dld_str_t *dsp, mblk_t *mp) { - dl_set_phys_addr_req_t *dlp = (dl_set_phys_addr_req_t *)udlp; + dl_set_phys_addr_req_t *dlp = (dl_set_phys_addr_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control message processing is serialized, we don't need - * to hold any locks to read any fields of dsp; we only need ds_lock - * to update the ds_passivestate field. - */ if (dsp->ds_dlstate == DL_UNATTACHED || DL_ACK_PENDING(dsp->ds_dlstate)) { dl_err = DL_OUTSTATE; @@ -943,14 +905,16 @@ proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } - err = mac_unicst_set(dsp->ds_mh, mp->b_rptr + dlp->dl_addr_offset); + err = mac_unicast_primary_set(dsp->ds_mh, + mp->b_rptr + dlp->dl_addr_offset); if (err != 0) { switch (err) { case EINVAL: @@ -962,32 +926,33 @@ proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) dl_err = DL_SYSERR; break; } - if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); + dls_active_clear(dsp); + + goto failed2; - goto failed; } - rw_enter(&dsp->ds_lock, RW_WRITER); + mac_perim_exit(mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_SET_PHYS_ADDR_REQ); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_SET_PHYS_ADDR_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_UDQOS_REQ */ -static boolean_t -proto_udqos_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_udqos_req(dld_str_t *dsp, mblk_t *mp) { - dl_udqos_req_t *dlp = (dl_udqos_req_t *)udlp; + dl_udqos_req_t *dlp = (dl_udqos_req_t *)mp->b_rptr; dl_qos_cl_sel1_t *selp; int off, len; t_uscalar_t dl_err; @@ -1013,21 +978,11 @@ proto_udqos_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } - if (dsp->ds_dlstate == DL_UNATTACHED || - DL_ACK_PENDING(dsp->ds_dlstate)) { - dl_err = DL_OUTSTATE; - goto failed; - } - - rw_enter(&dsp->ds_lock, RW_WRITER); dsp->ds_pri = selp->dl_priority; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_UDQOS_REQ); - return (B_TRUE); + return; failed: dlerrorack(q, mp, DL_UDQOS_REQ, dl_err, 0); - return (B_FALSE); } static boolean_t @@ -1047,19 +1002,16 @@ check_ip_above(queue_t *q) /* * DL_CAPABILITY_REQ */ -/*ARGSUSED*/ -static boolean_t -proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_capability_req(dld_str_t *dsp, mblk_t *mp) { - dl_capability_req_t *dlp = 
(dl_capability_req_t *)udlp; + dl_capability_req_t *dlp = (dl_capability_req_t *)mp->b_rptr; dl_capability_sub_t *sp; size_t size, len; offset_t off, end; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; - rw_enter(&dsp->ds_lock, RW_WRITER); - if (MBLKL(mp) < sizeof (dl_capability_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -1077,8 +1029,8 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) * support. Otherwise we enable the set of capabilities requested. */ if (dlp->dl_sub_length == 0) { - /* callee drops lock */ - return (proto_capability_advertise(dsp, mp)); + proto_capability_advertise(dsp, mp); + return; } if (!MBLKIN(mp, dlp->dl_sub_offset, dlp->dl_sub_length)) { @@ -1122,137 +1074,37 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) break; } - /* - * Large segment offload. (LSO) - */ - case DL_CAPAB_LSO: { - dl_capab_lso_t *lsop; - dl_capab_lso_t lso; - - lsop = (dl_capab_lso_t *)&sp[1]; - /* - * Copy for alignment. - */ - bcopy(lsop, &lso, sizeof (dl_capab_lso_t)); - dlcapabsetqid(&(lso.lso_mid), dsp->ds_rq); - bcopy(&lso, lsop, sizeof (dl_capab_lso_t)); - break; - } - - /* - * IP polling interface. - */ - case DL_CAPAB_POLL: { - dl_capab_dls_t *pollp; - dl_capab_dls_t poll; - - pollp = (dl_capab_dls_t *)&sp[1]; - /* - * Copy for alignment. - */ - bcopy(pollp, &poll, sizeof (dl_capab_dls_t)); - - switch (poll.dls_flags) { - default: - /*FALLTHRU*/ - case POLL_DISABLE: - proto_poll_disable(dsp); - break; - - case POLL_ENABLE: - ASSERT(!(dld_opt & DLD_OPT_NO_POLL)); - - /* - * Make sure polling is disabled. - */ - proto_poll_disable(dsp); - - /* - * Note that only IP should enable POLL. - */ - if (check_ip_above(dsp->ds_rq) && - proto_poll_enable(dsp, &poll)) { - bzero(&poll, sizeof (dl_capab_dls_t)); - poll.dls_flags = POLL_ENABLE; - } else { - bzero(&poll, sizeof (dl_capab_dls_t)); - poll.dls_flags = POLL_DISABLE; - } - break; - } - - dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq); - bcopy(&poll, pollp, sizeof (dl_capab_dls_t)); - break; - } - case DL_CAPAB_SOFT_RING: { - dl_capab_dls_t *soft_ringp; - dl_capab_dls_t soft_ring; + case DL_CAPAB_DLD: { + dl_capab_dld_t *dldp; + dl_capab_dld_t dld; - soft_ringp = (dl_capab_dls_t *)&sp[1]; + dldp = (dl_capab_dld_t *)&sp[1]; /* * Copy for alignment. */ - bcopy(soft_ringp, &soft_ring, - sizeof (dl_capab_dls_t)); - - switch (soft_ring.dls_flags) { - default: - /*FALLTHRU*/ - case SOFT_RING_DISABLE: - proto_soft_ring_disable(dsp); - break; - - case SOFT_RING_ENABLE: - ASSERT(!(dld_opt & DLD_OPT_NO_SOFTRING)); - /* - * Make sure soft_ring is disabled. - */ - proto_soft_ring_disable(dsp); - - /* - * Note that only IP can enable soft ring. 
-			 */
-			if (check_ip_above(dsp->ds_rq) &&
-			    proto_soft_ring_enable(dsp, &soft_ring)) {
-				bzero(&soft_ring,
-				    sizeof (dl_capab_dls_t));
-				soft_ring.dls_flags = SOFT_RING_ENABLE;
-			} else {
-				bzero(&soft_ring,
-				    sizeof (dl_capab_dls_t));
-				soft_ring.dls_flags = SOFT_RING_DISABLE;
-			}
-			break;
-		}
-
-		dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq);
-		bcopy(&soft_ring, soft_ringp,
-		    sizeof (dl_capab_dls_t));
+		bcopy(dldp, &dld, sizeof (dl_capab_dld_t));
+		dlcapabsetqid(&(dld.dld_mid), dsp->ds_rq);
+		bcopy(&dld, dldp, sizeof (dl_capab_dld_t));
 		break;
 	}
 
 	default:
 		break;
 	}
-
 		off += size;
 	}
-	rw_exit(&dsp->ds_lock);
 	qreply(q, mp);
-	return (B_TRUE);
+	return;
 failed:
-	rw_exit(&dsp->ds_lock);
 	dlerrorack(q, mp, DL_CAPABILITY_REQ, dl_err, 0);
-	return (B_FALSE);
 }
 
 /*
  * DL_NOTIFY_REQ
  */
-static boolean_t
-proto_notify_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_notify_req(dld_str_t *dsp, mblk_t *mp)
 {
-	dl_notify_req_t *dlp = (dl_notify_req_t *)udlp;
+	dl_notify_req_t *dlp = (dl_notify_req_t *)mp->b_rptr;
 	t_uscalar_t dl_err;
 	queue_t *q = dsp->ds_wq;
 	uint_t note =
@@ -1264,8 +1116,6 @@
 	    DL_NOTE_CAPAB_RENEG |
 	    DL_NOTE_SPEED;
 
-	rw_enter(&dsp->ds_lock, RW_WRITER);
-
 	if (MBLKL(mp) < sizeof (dl_notify_req_t)) {
 		dl_err = DL_BADPRIM;
 		goto failed;
@@ -1283,7 +1133,6 @@
 	 * Cache the notifications that are being enabled.
 	 */
 	dsp->ds_notifications = dlp->dl_notifications & note;
-	rw_exit(&dsp->ds_lock);
 	/*
 	 * The ACK carries all notifications regardless of which set is
 	 * being enabled.
@@ -1291,27 +1140,21 @@
 	dlnotifyack(q, mp, note);
 
 	/*
-	 * Solicit DL_NOTIFY_IND messages for each enabled notification.
+	 * Generate DL_NOTIFY_IND messages for each enabled notification.
 	 */
-	rw_enter(&dsp->ds_lock, RW_READER);
 	if (dsp->ds_notifications != 0) {
-		rw_exit(&dsp->ds_lock);
 		dld_str_notify_ind(dsp);
-	} else {
-		rw_exit(&dsp->ds_lock);
 	}
-	return (B_TRUE);
+	return;
 failed:
-	rw_exit(&dsp->ds_lock);
 	dlerrorack(q, mp, DL_NOTIFY_REQ, dl_err, 0);
-	return (B_FALSE);
 }
 
 /*
  * DL_UNITDATA_REQ
  */
 void
-dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp)
+proto_unitdata_req(dld_str_t *dsp, mblk_t *mp)
 {
 	queue_t *q = dsp->ds_wq;
 	dl_unitdata_req_t *dlp = (dl_unitdata_req_t *)mp->b_rptr;
@@ -1326,10 +1169,19 @@
 	uint_t max_sdu;
 
 	if (MBLKL(mp) < sizeof (dl_unitdata_req_t) || mp->b_cont == NULL) {
-		dl_err = DL_BADPRIM;
-		goto failed;
+		dlerrorack(q, mp, DL_UNITDATA_REQ, DL_BADPRIM, 0);
+		return;
 	}
 
+	mutex_enter(&dsp->ds_lock);
+	if (dsp->ds_dlstate != DL_IDLE) {
+		mutex_exit(&dsp->ds_lock);
+		dlerrorack(q, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
+		return;
+	}
+	DLD_DATATHR_INC(dsp);
+	mutex_exit(&dsp->ds_lock);
+
 	addr_length = dsp->ds_mip->mi_addr_length;
 
 	off = dlp->dl_dest_addr_offset;
@@ -1367,7 +1219,7 @@
 	/*
 	 * Build a packet header.
*/ - if ((bp = dls_header(dsp->ds_dc, addr, sap, dlp->dl_priority.dl_max, + if ((bp = dls_header(dsp, addr, sap, dlp->dl_priority.dl_max, &payload)) == NULL) { dl_err = DL_BADADDR; goto failed; @@ -1390,32 +1242,37 @@ dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp) */ ASSERT(bp->b_cont == NULL); bp->b_cont = payload; - dld_tx_single(dsp, bp); + + /* + * No lock can be held across modules and putnext()'s, + * which can happen here with the call from DLD_TX(). + */ + if (DLD_TX(dsp, bp, 0, 0) != NULL) { + /* flow-controlled */ + DLD_SETQFULL(dsp); + } + DLD_DATATHR_DCR(dsp); return; + failed: dlerrorack(q, mp, DL_UNITDATA_REQ, dl_err, 0); + DLD_DATATHR_DCR(dsp); return; baddata: dluderrorind(q, mp, (void *)addr, len, DL_BADDATA, 0); + DLD_DATATHR_DCR(dsp); } /* * DL_PASSIVE_REQ */ -/* ARGSUSED */ -static boolean_t -proto_passive_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_passive_req(dld_str_t *dsp, mblk_t *mp) { t_uscalar_t dl_err; /* - * READER lock is enough because ds_passivestate can only be changed - * as the result of non-data message processing. - */ - rw_enter(&dsp->ds_lock, RW_READER); - - /* * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. */ @@ -1430,209 +1287,281 @@ proto_passive_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) } dsp->ds_passivestate = DLD_PASSIVE; - rw_exit(&dsp->ds_lock); dlokack(dsp->ds_wq, mp, DL_PASSIVE_REQ); - return (B_TRUE); + return; failed: - rw_exit(&dsp->ds_lock); dlerrorack(dsp->ds_wq, mp, DL_PASSIVE_REQ, dl_err, 0); - return (B_FALSE); } + /* * Catch-all handler. */ -static boolean_t -proto_req(dld_str_t *dsp, union DL_primitives *dlp, mblk_t *mp) +static void +proto_req(dld_str_t *dsp, mblk_t *mp) { + union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; + dlerrorack(dsp->ds_wq, mp, dlp->dl_primitive, DL_UNSUPPORTED, 0); - return (B_FALSE); } -static void -proto_poll_disable(dld_str_t *dsp) +static int +dld_capab_perim(dld_str_t *dsp, void *data, uint_t flags) { - mac_handle_t mh; + switch (flags) { + case DLD_ENABLE: + mac_perim_enter_by_mh(dsp->ds_mh, (mac_perim_handle_t *)data); + return (0); - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + case DLD_DISABLE: + mac_perim_exit((mac_perim_handle_t)data); + return (0); - if (!dsp->ds_polling) - return; + case DLD_QUERY: + return (mac_perim_held(dsp->ds_mh)); + } + return (0); +} - /* - * It should be impossible to enable raw mode if polling is turned on. - */ - ASSERT(dsp->ds_mode != DLD_RAW); +static int +dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_direct_t *direct = data; - /* - * Reset the resource_add callback. - */ - mh = dls_mac(dsp->ds_dc); - mac_resource_set(mh, NULL, NULL); - mac_resources(mh); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - /* - * Set receive function back to default. - */ - dls_rx_set(dsp->ds_dc, (dsp->ds_mode == DLD_FASTPATH) ? - dld_str_rx_fastpath : dld_str_rx_unitdata, dsp); + switch (flags) { + case DLD_ENABLE: + dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, + direct->di_rx_ch); + /* + * TODO: XXXGopi + * + * Direct pointer to functions in the MAC layer + * should be passed here: + * + * 1) pass mac_tx() and mac_client_handle instead + * of str_mdata_fastpath_put() and dld_str_t. But + * not done presently because of some VLAN + * processing stuff in str_mdata_fastpath_put(). + * + * 2) pass a MAC layer callback instead of + * dld_flow_ctl_callb(). 
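+		 *
+		 * Until then, str_mdata_fastpath_put() and the dld_str_t
+		 * itself are handed out directly below.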
+ */
+ direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ direct->di_tx_dh = dsp;
- /*
- * Note that polling is disabled.
- */
- dsp->ds_polling = B_FALSE;
+ direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
+ direct->di_tx_cb_dh = dsp->ds_mch;
+ dsp->ds_direct = B_TRUE;
+
+ return (0);
+
+ case DLD_DISABLE:
+ dls_rx_set(dsp, (dsp->ds_mode == DLD_FASTPATH) ?
+ dld_str_rx_fastpath : dld_str_rx_unitdata, (void *)dsp);
+ dsp->ds_direct = B_FALSE;
+
+ return (0);
+ }
+ return (ENOTSUP);
}
-static boolean_t
-proto_poll_enable(dld_str_t *dsp, dl_capab_dls_t *pollp)
+/*
+ * dld_capab_poll_enable()
+ *
+ * This function is misnamed. All polling and fanouts are run out of the
+ * lower mac (in case of VNIC and the only mac in case of NICs). The
+ * availability of Rx ring and promiscuous mode is all taken care of
+ * between the soft ring set (mac_srs), the Rx ring, and S/W classifier.
+ * Any fanout necessary is done by the soft rings that are part of the
+ * mac_srs (by default mac_srs sends the packets up via a TCP and
+ * non-TCP soft ring).
+ *
+ * The mac_srs (or its associated soft rings) always store the ill_rx_ring
+ * (the cookie returned when they registered with IP during plumb) as their
+ * 2nd argument which is passed up as mac_resource_handle_t. The upcall
+ * function and 1st argument is what the caller registered when they
+ * called mac_rx_classify_flow_add() to register the flow. For VNIC,
+ * the function is vnic_rx and the argument is vnic_t. For the regular
+ * NIC case, it is mac_rx_default and mac_handle_t. As explained above, the
+ * mac_srs (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t)
+ * from its stored 2nd argument.
+ */
+static int
+dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll)
{
- mac_handle_t mh;
+ if (dsp->ds_polling)
+ return (EINVAL);
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
- ASSERT(!dsp->ds_polling);
+ if ((dld_opt & DLD_OPT_NO_POLL) != 0 || dsp->ds_mode == DLD_RAW)
+ return (ENOTSUP);
 /*
- * We cannot enable polling if raw mode
- * has been enabled.
+ * Enable client polling if and only if DLS bypass is possible.
+ * Special cases like VLANs need DLS processing in the Rx data path.
+ * In such a case we can neither allow the client (IP) to directly
+ * poll the softring (since DLS processing hasn't been done) nor can
+ * we allow DLS bypass.
 */
- if (dsp->ds_mode == DLD_RAW)
- return (B_FALSE);
-
- mh = dls_mac(dsp->ds_dc);
+ if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg))
+ return (ENOTSUP);
 /*
- * Register resources.
+ * Register soft ring resources. This will come in handy later if
+ * the user decides to modify CPU bindings to use more CPUs for the
+ * device, in which case we will switch to fanout using soft rings.
 */
- mac_resource_set(mh, (mac_resource_add_t)pollp->dls_ring_add,
- (void *)pollp->dls_rx_handle);
-
- mac_resources(mh);
+ mac_resource_set_common(dsp->ds_mch,
+ (mac_resource_add_t)poll->poll_ring_add_cf,
+ (mac_resource_remove_t)poll->poll_ring_remove_cf,
+ (mac_resource_quiesce_t)poll->poll_ring_quiesce_cf,
+ (mac_resource_restart_t)poll->poll_ring_restart_cf,
+ (mac_resource_bind_t)poll->poll_ring_bind_cf,
+ poll->poll_ring_ch);
- /*
- * Set the upstream receive function.
- */
- dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->dls_rx,
- (void *)pollp->dls_rx_handle);
+ mac_client_poll_enable(dsp->ds_mch);
- /*
- * Note that polling is enabled. This prevents further DLIOCHDRINFO
- * ioctls from overwriting the receive function pointer. 
- */ dsp->ds_polling = B_TRUE; - return (B_TRUE); + return (0); } -static void -proto_soft_ring_disable(dld_str_t *dsp) +/* ARGSUSED */ +static int +dld_capab_poll_disable(dld_str_t *dsp, dld_capab_poll_t *poll) { - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + if (!dsp->ds_polling) + return (EINVAL); - if (!dsp->ds_soft_ring) - return; + mac_client_poll_disable(dsp->ds_mch); + mac_resource_set(dsp->ds_mch, NULL, NULL); - /* - * It should be impossible to enable raw mode if soft_ring is turned on. - */ - ASSERT(dsp->ds_mode != DLD_RAW); - proto_change_soft_ring_fanout(dsp, SOFT_RING_NONE); - /* - * Note that fanout is disabled. - */ - dsp->ds_soft_ring = B_FALSE; + dsp->ds_polling = B_FALSE; + return (0); } -static boolean_t -proto_soft_ring_enable(dld_str_t *dsp, dl_capab_dls_t *soft_ringp) +static int +dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) { - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); - ASSERT(!dsp->ds_soft_ring); + dld_capab_poll_t *poll = data; - /* - * We cannot enable soft_ring if raw mode - * has been enabled. - */ - if (dsp->ds_mode == DLD_RAW) - return (B_FALSE); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - if (dls_soft_ring_enable(dsp->ds_dc, soft_ringp) == B_FALSE) - return (B_FALSE); + switch (flags) { + case DLD_ENABLE: + return (dld_capab_poll_enable(dsp, poll)); + case DLD_DISABLE: + return (dld_capab_poll_disable(dsp, poll)); + } + return (ENOTSUP); +} - dsp->ds_soft_ring = B_TRUE; - return (B_TRUE); +static int +dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_lso_t *lso = data; + + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + switch (flags) { + case DLD_ENABLE: { + mac_capab_lso_t mac_lso; + + /* + * Check if LSO is supported on this MAC & enable LSO + * accordingly. + */ + if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_LSO, &mac_lso)) { + lso->lso_max = mac_lso.lso_basic_tcp_ipv4.lso_max; + lso->lso_flags = 0; + /* translate the flag for mac clients */ + if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0) + lso->lso_flags |= DLD_LSO_TX_BASIC_TCP_IPV4; + dsp->ds_lso = B_TRUE; + dsp->ds_lso_max = lso->lso_max; + } else { + dsp->ds_lso = B_FALSE; + dsp->ds_lso_max = 0; + return (ENOTSUP); + } + return (0); + } + case DLD_DISABLE: { + dsp->ds_lso = B_FALSE; + dsp->ds_lso_max = 0; + return (0); + } + } + return (ENOTSUP); } -static void -proto_change_soft_ring_fanout(dld_str_t *dsp, int type) +static int +dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) { - dls_channel_t dc = dsp->ds_dc; + int err; - if (type == SOFT_RING_NONE) { - dls_rx_set(dc, (dsp->ds_mode == DLD_FASTPATH) ? - dld_str_rx_fastpath : dld_str_rx_unitdata, dsp); - } else if (type != SOFT_RING_NONE) { - dls_rx_set(dc, (dls_rx_t)dls_soft_ring_fanout, dc); + /* + * Don't enable direct callback capabilities unless the caller is + * the IP client. When a module is inserted in a stream (_I_INSERT) + * the stack initiates capability disable, but due to races, the + * module insertion may complete before the capability disable + * completes. So we limit the check to DLD_ENABLE case. 
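 *
 * A hypothetical calling sequence, for illustration only (the capab
 * function pointer and its handle come from the dl_capab_dld_t that is
 * advertised further below; the variable names here are invented): an
 * IP client enabling the direct interface would bracket the request
 * with the perimeter capability, roughly:
 *
 *	mac_perim_handle_t mph;
 *
 *	(void) capab(handle, DLD_CAPAB_PERIM, &mph, DLD_ENABLE);
 *	err = capab(handle, DLD_CAPAB_DIRECT, &direct, DLD_ENABLE);
 *	(void) capab(handle, DLD_CAPAB_PERIM, (void *)mph, DLD_DISABLE);
 *
 * Note that DLD_CAPAB_PERIM is exempt from the IP-client check below,
 * which is what makes this bracketing possible.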
+ */ + if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && + (dsp->ds_sap != ETHERTYPE_IP || !check_ip_above(dsp->ds_rq))) { + return (ENOTSUP); } + + switch (type) { + case DLD_CAPAB_DIRECT: + err = dld_capab_direct(dsp, data, flags); + break; + + case DLD_CAPAB_POLL: + err = dld_capab_poll(dsp, data, flags); + break; + + case DLD_CAPAB_PERIM: + err = dld_capab_perim(dsp, data, flags); + break; + + case DLD_CAPAB_LSO: + err = dld_capab_lso(dsp, data, flags); + break; + + default: + err = ENOTSUP; + break; + } + + return (err); } /* * DL_CAPABILITY_ACK/DL_ERROR_ACK */ -static boolean_t +static void proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) { dl_capability_ack_t *dlap; dl_capability_sub_t *dlsp; size_t subsize; - dl_capab_dls_t poll; - dl_capab_dls_t soft_ring; + dl_capab_dld_t dld; dl_capab_hcksum_t hcksum; - dl_capab_lso_t lso; dl_capab_zerocopy_t zcopy; uint8_t *ptr; queue_t *q = dsp->ds_wq; mblk_t *mp1; - boolean_t is_vlan = (dsp->ds_vid != VLAN_ID_NONE); - boolean_t poll_capable = B_FALSE; - boolean_t soft_ring_capable = B_FALSE; + boolean_t is_vlan; boolean_t hcksum_capable = B_FALSE; boolean_t zcopy_capable = B_FALSE; - boolean_t lso_capable = B_FALSE; - mac_capab_lso_t mac_lso; - - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + boolean_t dld_capable = B_FALSE; /* * Initially assume no capabilities. */ subsize = 0; - - /* - * Check if soft ring can be enabled on this interface. Note that we - * do not enable softring on any legacy drivers, because doing that - * would hurt the performance if the legacy driver has its own taskq - * implementation. Further, most high-performance legacy drivers do - * have their own taskq implementation. - * - * If advertising DL_CAPAB_SOFT_RING has not been explicitly disabled, - * reserve space for that capability. - */ - if (!mac_is_legacy(dsp->ds_mh) && !(dld_opt & DLD_OPT_NO_SOFTRING)) { - soft_ring_capable = B_TRUE; - subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_dls_t); - } - - /* - * Check if polling can be enabled on this interface. - * If advertising DL_CAPAB_POLL has not been explicitly disabled - * then reserve space for that capability. - */ - if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_POLL, NULL) && - !(dld_opt & DLD_OPT_NO_POLL) && !is_vlan) { - poll_capable = B_TRUE; - subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_dls_t); - } + is_vlan = (mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE); /* * Check if checksum offload is supported on this MAC. Don't @@ -1652,16 +1581,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Check if LSO is supported on this MAC, then reserve space for - * the DL_CAPAB_LSO capability. - */ - if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_LSO, &mac_lso)) { - lso_capable = B_TRUE; - subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_lso_t); - } - - /* * Check if zerocopy is supported on this interface. * If advertising DL_CAPAB_ZEROCOPY has not been explicitly disabled * then reserve space for that capability. @@ -1674,14 +1593,22 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* + * Direct capability negotiation interface between IP and DLD + */ + if (dsp->ds_sap == ETHERTYPE_IP && check_ip_above(dsp->ds_rq)) { + dld_capable = B_TRUE; + subsize += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_dld_t); + } + + /* * If there are no capabilities to advertise or if we * can't allocate a response, send a DL_ERROR_ACK. 
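 *
 * For orientation (an annotation, not patch text): the ack constructed
 * below is laid out contiguously as
 *
 *	dl_capability_ack_t | dl_capability_sub_t | subcap | ...
 *
 * with subsize accumulating the space reserved for each sub_t/subcap
 * pair above; the trailing ASSERT on ptr re-checks this arithmetic.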
*/ if ((mp1 = reallocb(mp, sizeof (dl_capability_ack_t) + subsize, 0)) == NULL) { - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_CAPABILITY_REQ, DL_NOTSUPPORTED, 0); - return (B_FALSE); + return; } mp = mp1; @@ -1695,56 +1622,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) ptr = (uint8_t *)&dlap[1]; /* - * IP polling interface. - */ - if (poll_capable) { - /* - * Attempt to disable just in case this is a re-negotiation; - * READER lock is enough because ds_polling can only be - * changed as the result of non-data message processing. - */ - proto_poll_disable(dsp); - - dlsp = (dl_capability_sub_t *)ptr; - - dlsp->dl_cap = DL_CAPAB_POLL; - dlsp->dl_length = sizeof (dl_capab_dls_t); - ptr += sizeof (dl_capability_sub_t); - - bzero(&poll, sizeof (dl_capab_dls_t)); - poll.dls_version = POLL_VERSION_1; - poll.dls_flags = POLL_CAPABLE; - poll.dls_tx_handle = (uintptr_t)dsp; - poll.dls_tx = (uintptr_t)str_mdata_fastpath_put; - dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq); - bcopy(&poll, ptr, sizeof (dl_capab_dls_t)); - ptr += sizeof (dl_capab_dls_t); - } - - - if (soft_ring_capable) { - dlsp = (dl_capability_sub_t *)ptr; - - dlsp->dl_cap = DL_CAPAB_SOFT_RING; - dlsp->dl_length = sizeof (dl_capab_dls_t); - ptr += sizeof (dl_capability_sub_t); - - bzero(&soft_ring, sizeof (dl_capab_dls_t)); - soft_ring.dls_version = SOFT_RING_VERSION_1; - soft_ring.dls_flags = SOFT_RING_CAPABLE; - soft_ring.dls_tx_handle = (uintptr_t)dsp; - soft_ring.dls_tx = (uintptr_t)str_mdata_fastpath_put; - soft_ring.dls_ring_change_status = - (uintptr_t)proto_change_soft_ring_fanout; - soft_ring.dls_ring_bind = (uintptr_t)soft_ring_bind; - soft_ring.dls_ring_unbind = (uintptr_t)soft_ring_unbind; - - dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq); - bcopy(&soft_ring, ptr, sizeof (dl_capab_dls_t)); - ptr += sizeof (dl_capab_dls_t); - } - - /* * TCP/IP checksum offload. */ if (hcksum_capable) { @@ -1761,32 +1638,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Large segment offload. (LSO) - */ - if (lso_capable) { - dlsp = (dl_capability_sub_t *)ptr; - - dlsp->dl_cap = DL_CAPAB_LSO; - dlsp->dl_length = sizeof (dl_capab_lso_t); - ptr += sizeof (dl_capability_sub_t); - - lso.lso_version = LSO_VERSION_1; - lso.lso_flags = mac_lso.lso_flags; - lso.lso_max = mac_lso.lso_basic_tcp_ipv4.lso_max; - - /* Simply enable LSO with DLD */ - dsp->ds_lso = B_TRUE; - dsp->ds_lso_max = lso.lso_max; - - dlcapabsetqid(&(lso.lso_mid), dsp->ds_rq); - bcopy(&lso, ptr, sizeof (dl_capab_lso_t)); - ptr += sizeof (dl_capab_lso_t); - } else { - dsp->ds_lso = B_FALSE; - dsp->ds_lso_max = 0; - } - - /* * Zero copy */ if (zcopy_capable) { @@ -1805,11 +1656,28 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) ptr += sizeof (dl_capab_zerocopy_t); } - ASSERT(ptr == mp->b_rptr + sizeof (dl_capability_ack_t) + subsize); + /* + * Direct capability negotiation interface between IP and DLD. + * Refer to dld.h for details. 
+ */ + if (dld_capable) { + dlsp = (dl_capability_sub_t *)ptr; + dlsp->dl_cap = DL_CAPAB_DLD; + dlsp->dl_length = sizeof (dl_capab_dld_t); + ptr += sizeof (dl_capability_sub_t); - rw_exit(&dsp->ds_lock); + bzero(&dld, sizeof (dl_capab_dld_t)); + dld.dld_version = DLD_CURRENT_VERSION; + dld.dld_capab = (uintptr_t)dld_capab; + dld.dld_capab_handle = (uintptr_t)dsp; + + dlcapabsetqid(&(dld.dld_mid), dsp->ds_rq); + bcopy(&dld, ptr, sizeof (dl_capab_dld_t)); + ptr += sizeof (dl_capab_dld_t); + } + + ASSERT(ptr == mp->b_rptr + sizeof (dl_capability_ack_t) + subsize); qreply(q, mp); - return (B_TRUE); } /* @@ -1819,8 +1687,5 @@ void dld_capabilities_disable(dld_str_t *dsp) { if (dsp->ds_polling) - proto_poll_disable(dsp); - - if (dsp->ds_soft_ring) - proto_soft_ring_disable(dsp); + (void) dld_capab_poll_disable(dsp, NULL); } diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 8694b9d6c4..cf7e7010dc 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -27,17 +27,17 @@ * Data-Link Driver */ +#include <inet/common.h> +#include <sys/strsubr.h> #include <sys/stropts.h> #include <sys/strsun.h> -#include <sys/strsubr.h> -#include <sys/atomic.h> -#include <sys/disp.h> -#include <sys/callb.h> #include <sys/vlan.h> -#include <sys/dld.h> #include <sys/dld_impl.h> -#include <sys/dls_impl.h> -#include <inet/common.h> +#include <sys/cpuvar.h> +#include <sys/callb.h> +#include <sys/list.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> static int str_constructor(void *, void *, int); static void str_destructor(void *, void *); @@ -49,111 +49,80 @@ static void str_notify_link_up(dld_str_t *); static void str_notify_link_down(dld_str_t *); static void str_notify_capab_reneg(dld_str_t *); static void str_notify_speed(dld_str_t *, uint32_t); -static void str_notify(void *, mac_notify_type_t); static void ioc_native(dld_str_t *, mblk_t *); static void ioc_margin(dld_str_t *, mblk_t *); static void ioc_raw(dld_str_t *, mblk_t *); static void ioc_fast(dld_str_t *, mblk_t *); static void ioc(dld_str_t *, mblk_t *); -static void dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t, - uint_t, uint_t); +static void dld_ioc(dld_str_t *, mblk_t *); static void dld_wput_nondata(dld_str_t *, mblk_t *); -static void dld_wput_nondata_task(void *); -static void dld_flush_nondata(dld_str_t *); + +static void str_mdata_raw_put(dld_str_t *, mblk_t *); static mblk_t *i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t); static mblk_t *i_dld_ether_header_strip_tag(mblk_t *); static uint32_t str_count; static kmem_cache_t *str_cachep; -static taskq_t *dld_disp_taskq = NULL; static mod_hash_t *str_hashp; #define STR_HASHSZ 64 #define STR_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key)) -static inline uint_t mp_getsize(mblk_t *); +#define dld_taskq system_taskq -/* - * Interval to count the TX queued depth. Default is 1s (1000000us). - * Count the queue depth immediately (not by timeout) if this is set to 0. - * See more details above dld_tx_enqueue(). - */ -uint_t tx_qdepth_interval = 1000000; +static kmutex_t dld_taskq_lock; +static kcondvar_t dld_taskq_cv; +static list_t dld_taskq_list; /* List of dld_str_t */ +boolean_t dld_taskq_quit; +boolean_t dld_taskq_done; + +static void dld_taskq_dispatch(void); /* - * Some notes on entry points, flow-control, queueing and locking: + * Some notes on entry points, flow-control, queueing. 
 *
 * This driver exports the traditional STREAMS put entry point as well as
 * the non-STREAMS fast-path transmit routine which is provided to IP via
 * the DL_CAPAB_POLL negotiation. The put procedure handles all control
 * and data operations, while the fast-path routine deals only with M_DATA
 * fast-path packets. Regardless of the entry point, all outbound packets
- * will end up in dld_tx_single(), where they will be delivered to the MAC
- * driver.
+ * will end up in DLD_TX(), where they will be delivered to the MAC layer.
 *
- * The transmit logic operates in two modes: a "not busy" mode where the
- * packets will be delivered to the MAC for a send attempt, or "busy" mode
- * where they will be enqueued in the internal queue because of flow-control.
- * Flow-control happens when the MAC driver indicates the packets couldn't
- * be transmitted due to lack of resources (e.g. running out of descriptors).
- * In such case, the driver will place a dummy message on its write-side
- * STREAMS queue so that the queue is marked as "full". Any subsequent
- * packets arriving at the driver will be enqueued in the internal queue,
- * which is drained in the context of the service thread that gets scheduled
- * whenever the driver is in the "busy" mode. When all packets have been
- * successfully delivered by MAC and the internal queue is empty, it will
- * transition to the "not busy" mode by removing the dummy message from the
- * write-side STREAMS queue; in effect this will trigger backenabling.
- * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
- * to the above reasons.
+ * The transmit logic operates in the following way: All packets coming
+ * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
+ * happens when the MAC layer indicates the packets couldn't be
+ * transmitted due to 1) lack of resources (e.g. running out of
+ * descriptors), or 2) reaching the allowed bandwidth limit for this
+ * particular flow. The indication comes in the form of a Tx cookie that
+ * identifies the blocked ring. In such a case, DLD will place a
+ * dummy message on its write-side STREAMS queue so that the queue is
+ * marked as "full". Any subsequent packets arriving at the driver will
+ * still be sent to the MAC layer, where they either get queued in the Tx
+ * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
+ * queue gets enabled when the MAC layer notifies DLD through MAC_NOTE_TX.
+ * When the write service procedure runs, it will remove the dummy
+ * message from the write-side STREAMS queue; in effect this will trigger
+ * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
+ * respectively, due to the above reasons.
 *
- * The driver implements an internal transmit queue independent of STREAMS.
- * This allows for flexibility and provides a fast enqueue/dequeue mechanism
- * compared to the putq() and get() STREAMS interfaces. The only putq() and
- * getq() operations done by the driver are those related to placing and
- * removing the dummy message to/from the write-side STREAMS queue for flow-
- * control purposes.
+ * All non-data operations, both DLPI and ioctls, are single threaded on a per
+ * dld_str_t endpoint. This is done using a taskq so that the control operation
+ * has kernel context and can cv_wait for resources. In addition all set type
+ * operations that involve mac level state modification are serialized on a
+ * per mac end point using the perimeter mechanism provided by the mac layer. 
+ * This serializes all mac clients trying to modify a single mac end point over + * the entire sequence of mac calls made by that client as an atomic unit. The + * mac framework locking is described in mac.c. A critical element is that + * DLD/DLS does not hold any locks across the mac perimeter. * - * Locking is done independent of STREAMS due to the driver being fully MT. - * Threads entering the driver (either from put or service entry points) - * will most likely be readers, with the exception of a few writer cases - * such those handling DLPI attach/detach/bind/unbind/etc. or any of the - * DLD-related ioctl requests. The DLPI detach case is special, because - * it involves freeing resources and therefore must be single-threaded. - * Unfortunately the readers/writers lock can't be used to protect against - * it, because the lock is dropped prior to the driver calling places where - * putnext() may be invoked, and such places may depend on those resources - * to exist. Because of this, the driver always completes the DLPI detach - * process when there are no other threads running in the driver. This is - * done by keeping track of the number of threads, such that the the last - * thread leaving the driver will finish the pending DLPI detach operation. - */ - -/* - * dld_max_q_count is the queue depth threshold used to limit the number of - * outstanding packets or bytes allowed in the queue; once this limit is - * reached the driver will free any incoming ones until the queue depth - * drops below the threshold. - * - * This buffering is provided to accomodate clients which do not employ - * their own buffering scheme, and to handle occasional packet bursts. - * Clients which handle their own buffering will receive positive feedback - * from this driver as soon as it transitions into the "busy" state, i.e. - * when the queue is initially filled up; they will get backenabled once - * the queue is empty. - * - * The value chosen here is rather arbitrary; in future some intelligent - * heuristics may be involved which could take into account the hardware's - * transmit ring size, etc. - */ -uint_t dld_max_q_count = (16 * 1024 *1024); - -/* * dld_finddevinfo() returns the dev_info_t * corresponding to a particular * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that * match dev_t. If a stream is found and it is attached, its dev_info_t * - * is returned. + * is returned. If the mac handle is non-null, it can be safely accessed + * below. The mac handle won't be freed until the mac_unregister which + * won't happen until the driver detaches. The DDI framework ensures that + * the detach won't happen while a getinfo is in progress. */ typedef struct i_dld_str_state_s { major_t ds_major; @@ -167,35 +136,31 @@ i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) { i_dld_str_state_t *statep = arg; dld_str_t *dsp = (dld_str_t *)val; + mac_handle_t mh; if (statep->ds_major != dsp->ds_major) return (MH_WALK_CONTINUE); ASSERT(statep->ds_minor != 0); + mh = dsp->ds_mh; - /* - * Access to ds_mh needs to be protected by ds_lock. - */ - rw_enter(&dsp->ds_lock, RW_READER); if (statep->ds_minor == dsp->ds_minor) { /* * Clone: a clone minor is unique. we can terminate the * walk if we find a matching stream -- even if we fail * to obtain the devinfo. 
*/ - if (dsp->ds_mh != NULL) - statep->ds_dip = mac_devinfo_get(dsp->ds_mh); - rw_exit(&dsp->ds_lock); + if (mh != NULL) + statep->ds_dip = mac_devinfo_get(mh); return (MH_WALK_TERMINATE); } - rw_exit(&dsp->ds_lock); return (MH_WALK_CONTINUE); } static dev_info_t * dld_finddevinfo(dev_t dev) { - dev_info_t *dip; + dev_info_t *dip; i_dld_str_state_t state; if (getminor(dev) == 0) @@ -204,7 +169,7 @@ dld_finddevinfo(dev_t dev) /* * See if it's a minor node of a link */ - if ((dip = dls_finddevinfo(dev)) != NULL) + if ((dip = dls_link_devinfo(dev)) != NULL) return (dip); state.ds_minor = getminor(dev); @@ -319,11 +284,24 @@ dld_close(queue_t *rq) dld_str_t *dsp = rq->q_ptr; /* + * All modules on top have been popped off. So there can't be any + * threads from the top. + */ + ASSERT(dsp->ds_datathr_cnt == 0); + + /* + * Wait until pending DLPI requests are processed. + */ + mutex_enter(&dsp->ds_lock); + while (dsp->ds_dlpi_pending) + cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock); + mutex_exit(&dsp->ds_lock); + + /* * Disable the queue srv(9e) routine. */ qprocsoff(rq); - dld_finish_pending_task(dsp); /* * This stream was open to a provider node. Check to see @@ -348,58 +326,55 @@ dld_close(queue_t *rq) void dld_wput(queue_t *wq, mblk_t *mp) { - dld_str_t *dsp = wq->q_ptr; + dld_str_t *dsp = (dld_str_t *)wq->q_ptr; + dld_str_mode_t mode; switch (DB_TYPE(mp)) { - case M_DATA: { - dld_tx_t tx; - - DLD_TX_ENTER(dsp); - if ((tx = dsp->ds_tx) != NULL) - tx(dsp, mp); - else - freemsg(mp); - DLD_TX_EXIT(dsp); + case M_DATA: + mutex_enter(&dsp->ds_lock); + if (dsp->ds_dlstate == DL_IDLE) { + mode = dsp->ds_mode; + if (mode == DLD_FASTPATH || mode == DLD_RAW) { + DLD_DATATHR_INC(dsp); + mutex_exit(&dsp->ds_lock); + if (mode == DLD_FASTPATH) { + (void) str_mdata_fastpath_put(dsp, mp, + 0, 0); + } else { + str_mdata_raw_put(dsp, mp); + } + DLD_DATATHR_DCR(dsp); + break; + } + } + mutex_exit(&dsp->ds_lock); + freemsg(mp); break; - } + case M_PROTO: case M_PCPROTO: { t_uscalar_t prim; - dld_tx_t tx; - if (MBLKL(mp) < sizeof (t_uscalar_t)) { - freemsg(mp); - return; - } + if (MBLKL(mp) < sizeof (t_uscalar_t)) + break; prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; - if (prim != DL_UNITDATA_REQ) { - /* Control path */ + + if (prim == DL_UNITDATA_REQ) { + proto_unitdata_req(dsp, mp); + } else { dld_wput_nondata(dsp, mp); - break; } - - /* Data path */ - DLD_TX_ENTER(dsp); - if ((tx = dsp->ds_unitdata_tx) != NULL) - tx(dsp, mp); - else - dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0); - DLD_TX_EXIT(dsp); break; } + case M_IOCTL: - case M_IOCDATA: - /* Control path */ dld_wput_nondata(dsp, mp); break; + case M_FLUSH: - /* - * Flush both the data messages and the control messages. - */ if (*mp->b_rptr & FLUSHW) { - dld_flush_nondata(dsp); - dld_tx_flush(dsp); + DLD_CLRQFULL(dsp); *mp->b_rptr &= ~FLUSHW; } @@ -409,6 +384,7 @@ dld_wput(queue_t *wq, mblk_t *mp) freemsg(mp); } break; + default: freemsg(mp); break; @@ -416,122 +392,14 @@ dld_wput(queue_t *wq, mblk_t *mp) } /* - * Called by GLDv3 control node to process the ioctls. It will start - * a taskq to allow the ioctl processing to block. This is a temporary - * solution, and will be replaced by a more graceful approach afterwards. 
- */ -void -dld_ioctl(queue_t *wq, mblk_t *mp) -{ - dld_wput_nondata(wq->q_ptr, mp); -} - -/* * qi_srvp: srv(9e) */ void dld_wsrv(queue_t *wq) { - mblk_t *mp, *head, *tail; dld_str_t *dsp = wq->q_ptr; - uint_t cnt, msgcnt; - timeout_id_t tid = 0; - - rw_enter(&dsp->ds_lock, RW_READER); - /* - * Grab all packets (chained via b_next) off our transmit queue - * and try to send them all to the MAC layer. Since the queue - * is independent of streams, we are able to dequeue all messages - * at once without looping through getq() and manually chaining - * them. Note that the queue size parameters (byte and message - * counts) are cleared as well, but we postpone the backenabling - * until after the MAC transmit since some packets may end up - * back at our transmit queue. - */ - mutex_enter(&dsp->ds_tx_list_lock); - if ((mp = dsp->ds_tx_list_head) == NULL) { - ASSERT(!dsp->ds_tx_qbusy); - ASSERT(dsp->ds_tx_flow_mp != NULL); - ASSERT(dsp->ds_tx_list_head == NULL); - ASSERT(dsp->ds_tx_list_tail == NULL); - ASSERT(dsp->ds_tx_cnt == 0); - ASSERT(dsp->ds_tx_msgcnt == 0); - mutex_exit(&dsp->ds_tx_list_lock); - rw_exit(&dsp->ds_lock); - return; - } - head = mp; - tail = dsp->ds_tx_list_tail; - dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL; - cnt = dsp->ds_tx_cnt; - msgcnt = dsp->ds_tx_msgcnt; - dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0; - mutex_exit(&dsp->ds_tx_list_lock); - - /* - * Discard packets unless we are attached and bound; note that - * the driver mode (fastpath/raw/unitdata) is irrelevant here, - * because regardless of the mode all transmit will end up in - * dld_tx_single() where the packets may be queued. - */ - ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA)); - if (dsp->ds_dlstate != DL_IDLE) { - freemsgchain(mp); - goto done; - } - - /* - * Attempt to transmit one or more packets. If the MAC can't - * send them all, re-queue the packet(s) at the beginning of - * the transmit queue to avoid any re-ordering. - */ - mp = dls_tx(dsp->ds_dc, mp); - if (mp == head) { - /* - * No message was sent out. Take the saved the queue depth - * as the input, so that dld_tx_enqueue() need not to - * calculate it again. - */ - dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt); - } else if (mp != NULL) { - /* - * Some but not all messages were sent out. dld_tx_enqueue() - * needs to start the timer to calculate the queue depth if - * timer has not been started. - * - * Note that a timer is used to calculate the queue depth - * to improve network performance, especially for TCP, in - * which case packets are sent without canput() being checked, - * and mostly end up in dld_tx_enqueue() under heavy load. - */ - dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0); - } - -done: - /* - * Grab the list lock again and check if the transmit queue is - * really empty; if so, lift up flow-control and backenable any - * writer queues. If the queue is not empty, schedule service - * thread to drain it. - */ - mutex_enter(&dsp->ds_tx_list_lock); - if (dsp->ds_tx_list_head == NULL) { - dsp->ds_tx_flow_mp = getq(wq); - ASSERT(dsp->ds_tx_flow_mp != NULL); - dsp->ds_tx_qbusy = B_FALSE; - if ((tid = dsp->ds_tx_qdepth_tid) != 0) - dsp->ds_tx_qdepth_tid = 0; - } - mutex_exit(&dsp->ds_tx_list_lock); - - /* - * Note that ds_tx_list_lock (which is acquired by the timeout - * callback routine) cannot be held across the call to untimeout(). 
- */ - if (tid != 0) - (void) untimeout(tid); - rw_exit(&dsp->ds_lock); + DLD_CLRQFULL(dsp); } void @@ -602,12 +470,6 @@ dld_str_init(void) ASSERT(str_cachep != NULL); /* - * Create taskq to process DLPI requests. - */ - dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2, - INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); - - /* * Create a hash table for maintaining dld_str_t's. * The ds_minor field (the clone minor number) of a dld_str_t * is used as a key for this hash table because this number is @@ -615,6 +477,16 @@ dld_str_init(void) */ str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ, mod_hash_null_valdtor); + + mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL); + + dld_taskq_quit = B_FALSE; + dld_taskq_done = B_FALSE; + list_create(&dld_taskq_list, sizeof (dld_str_t), + offsetof(dld_str_t, ds_tqlist)); + (void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0, + &p0, TS_RUN, minclsyspri); } /* @@ -629,10 +501,16 @@ dld_str_fini(void) if (str_count != 0) return (EBUSY); - ASSERT(dld_disp_taskq != NULL); - taskq_destroy(dld_disp_taskq); - dld_disp_taskq = NULL; - + /* + * Ask the dld_taskq thread to quit and wait for it to be done + */ + mutex_enter(&dld_taskq_lock); + dld_taskq_quit = B_TRUE; + cv_signal(&dld_taskq_cv); + while (!dld_taskq_done) + cv_wait(&dld_taskq_cv, &dld_taskq_lock); + mutex_exit(&dld_taskq_lock); + list_destroy(&dld_taskq_list); /* * Destroy object cache. */ @@ -668,7 +546,6 @@ dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style) dsp->ds_type = type; dsp->ds_major = major; dsp->ds_style = style; - dsp->ds_tx = dsp->ds_unitdata_tx = NULL; /* * Initialize the queue pointers. @@ -690,20 +567,6 @@ dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style) return (dsp); } -void -dld_finish_pending_task(dld_str_t *dsp) -{ - /* - * Wait until the pending requests are processed by the worker thread. - */ - mutex_enter(&dsp->ds_disp_lock); - dsp->ds_closing = B_TRUE; - while (dsp->ds_tid != NULL) - cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock); - dsp->ds_closing = B_FALSE; - mutex_exit(&dsp->ds_disp_lock); -} - /* * Destroy a dld_str_t object. */ @@ -713,30 +576,29 @@ dld_str_destroy(dld_str_t *dsp) queue_t *rq; queue_t *wq; mod_hash_val_t val; + /* * Clear the queue pointers. */ rq = dsp->ds_rq; wq = dsp->ds_wq; ASSERT(wq == WR(rq)); - rq->q_ptr = wq->q_ptr = NULL; dsp->ds_rq = dsp->ds_wq = NULL; - ASSERT(!RW_LOCK_HELD(&dsp->ds_lock)); - ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock)); - ASSERT(dsp->ds_tx_list_head == NULL); - ASSERT(dsp->ds_tx_list_tail == NULL); - ASSERT(dsp->ds_tx_cnt == 0); - ASSERT(dsp->ds_tx_msgcnt == 0); - ASSERT(dsp->ds_tx_qdepth_tid == 0); - ASSERT(!dsp->ds_tx_qbusy); + ASSERT(dsp->ds_dlstate == DL_UNATTACHED); + ASSERT(dsp->ds_sap == 0); + ASSERT(dsp->ds_mh == NULL); + ASSERT(dsp->ds_mch == NULL); + ASSERT(dsp->ds_promisc == 0); + ASSERT(dsp->ds_mph == NULL); + ASSERT(dsp->ds_mip == NULL); + ASSERT(dsp->ds_mnh == NULL); - ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock)); - ASSERT(dsp->ds_pending_head == NULL); - ASSERT(dsp->ds_pending_tail == NULL); - ASSERT(dsp->ds_tx == NULL); - ASSERT(dsp->ds_unitdata_tx == NULL); + ASSERT(dsp->ds_polling == B_FALSE); + ASSERT(dsp->ds_direct == B_FALSE); + ASSERT(dsp->ds_lso == B_FALSE); + ASSERT(dsp->ds_lso_max == 0); /* * Reinitialize all the flags. 
@@ -746,6 +608,18 @@ dld_str_destroy(dld_str_t *dsp)
 dsp->ds_mode = DLD_UNITDATA;
 dsp->ds_native = B_FALSE;
+ ASSERT(dsp->ds_datathr_cnt == 0);
+ ASSERT(dsp->ds_pending_head == NULL);
+ ASSERT(dsp->ds_pending_tail == NULL);
+ ASSERT(!dsp->ds_dlpi_pending);
+
+ ASSERT(dsp->ds_dlp == NULL);
+ ASSERT(dsp->ds_dmap == NULL);
+ ASSERT(dsp->ds_rx == NULL);
+ ASSERT(dsp->ds_rx_arg == NULL);
+ ASSERT(dsp->ds_next == NULL);
+ ASSERT(dsp->ds_head == NULL);
+
 /*
 * Free the dummy mblk if exists.
 */
@@ -786,12 +660,9 @@ str_constructor(void *buf, void *cdrarg, int kmflags)
 */
 dsp->ds_dlstate = DL_UNATTACHED;
- rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
- mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
- mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL);
- mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL);
+ mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
+ cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
 return (0);
}
@@ -806,78 +677,20 @@ str_destructor(void *buf, void *cdrarg)
 dld_str_t *dsp = buf;
 /*
- * Make sure the DLPI state machine was reset.
- */
- ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
-
- /*
- * Make sure the data-link interface was closed.
- */
- ASSERT(dsp->ds_mh == NULL);
- ASSERT(dsp->ds_dc == NULL);
- ASSERT(dsp->ds_tx == NULL);
- ASSERT(dsp->ds_unitdata_tx == NULL);
- ASSERT(dsp->ds_intx_cnt == 0);
- ASSERT(dsp->ds_detaching == B_FALSE);
-
- /*
- * Make sure enabled notifications are cleared.
- */
- ASSERT(dsp->ds_notifications == 0);
-
- /*
- * Make sure polling is disabled.
- */
- ASSERT(!dsp->ds_polling);
-
- /*
 * Release the minor number.
 */
 mac_minor_rele(dsp->ds_minor);
- ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
- rw_destroy(&dsp->ds_lock);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
- mutex_destroy(&dsp->ds_tx_list_lock);
 ASSERT(dsp->ds_tx_flow_mp == NULL);
- ASSERT(dsp->ds_pending_head == NULL);
- ASSERT(dsp->ds_pending_tail == NULL);
- ASSERT(!dsp->ds_closing);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
- mutex_destroy(&dsp->ds_disp_lock);
- cv_destroy(&dsp->ds_disp_cv);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock));
- mutex_destroy(&dsp->ds_tx_lock);
- cv_destroy(&dsp->ds_tx_cv);
-}
-
-void
-dld_tx_single(dld_str_t *dsp, mblk_t *mp)
-{
- /*
- * If we are busy enqueue the packet and return.
- * Otherwise hand them over to the MAC driver for transmission.
- * If the message didn't get sent it will be queued.
- *
- * Note here that we don't grab the list lock prior to checking
- * the busy flag. This is okay, because a missed transition
- * will not cause any packet reordering for any particular TCP
- * connection (which is single-threaded). The enqueue routine
- * will atomically set the busy flag and schedule the service
- * thread to run; the flag is only cleared by the service thread
- * when there is no more packet to be transmitted.
- */
- if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL))
- dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp));
+ mutex_destroy(&dsp->ds_lock);
+ cv_destroy(&dsp->ds_datathr_cv);
+ cv_destroy(&dsp->ds_dlpi_pending_cv);
}
/*
 * Update the priority bits and VID (may need to insert tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet. 
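 *
 * For reference (background assumption, not text from this change): the
 * 802.1Q tag control information rewritten here packs into 16 bits as
 *
 *	tci = (pri << 13) | (cfi << 12) | vid
 *
 * so a VID of VLAN_ID_NONE denotes a priority-only tag.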
*/ static mblk_t * @@ -960,18 +773,16 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid) } /* - * M_DATA put - * - * The poll callback function for DLS clients which are not in the per-stream - * mode. This function is called from an upper layer protocol (currently only - * tcp and udp). + * M_DATA put (IP fast-path mode) */ -void -str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp) +mac_tx_cookie_t +str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) { boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); mblk_t *newmp; uint_t pri; + mac_tx_cookie_t cookie; if (is_ethernet) { /* @@ -988,25 +799,28 @@ str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp) } } - dld_tx_single(dsp, mp); - return; + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); discard: /* TODO: bump kstat? */ freemsg(mp); + return (NULL); } /* - * M_DATA put (DLIOCRAW mode). + * M_DATA put (DLIOCRAW mode) */ -void +static void str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) { boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid; + uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -1039,7 +853,7 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) size += MBLKL(bp); } - if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0) + if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0) goto discard; mac_sdu_get(dsp->ds_mh, NULL, &max_sdu); @@ -1052,12 +866,14 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { + dvid = mac_client_vid(dsp->ds_mch); + /* * Discard the packet if this is a VLAN stream but the VID in * the packet is not correct. */ vid = VLAN_ID(mhi.mhi_tci); - if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) goto discard; /* @@ -1074,16 +890,19 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) * packets on a VLAN stream. */ pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) { + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { if ((newmp = i_dld_ether_header_update_tag(mp, - pri, dsp->ds_vid)) == NULL) { + pri, dvid)) == NULL) { goto discard; } mp = newmp; } } - dld_tx_single(dsp, mp); + if (DLD_TX(dsp, mp, 0, 0) != NULL) { + /* Turn on flow-control for dld */ + DLD_SETQFULL(dsp); + } return; discard: @@ -1097,18 +916,21 @@ discard: int dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) { - dev_t dev; - int err; - const char *drvname; - dls_channel_t dc; - uint_t addr_length; - boolean_t qassociated = B_FALSE; - - ASSERT(dsp->ds_dc == NULL); + dev_t dev; + int err; + const char *drvname; + mac_perim_handle_t mph; + boolean_t qassociated = B_FALSE; + dls_link_t *dlp = NULL; + dls_dl_handle_t ddp = NULL; + boolean_t entered_perim = B_FALSE; if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL) return (EINVAL); + if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) + return (ENOTSUP); + /* * /dev node access. This will still be supported for backward * compatibility reason. @@ -1120,46 +942,22 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) qassociated = B_TRUE; } - /* - * Open a channel. - */ - if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) { - /* - * style-2 VLAN open, this is a /dev VLAN ppa open - * which might result in a newly created dls_vlan_t. 
- */ - err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc); - if (err != 0) { - if (qassociated) - (void) qassociate(dsp->ds_wq, -1); - return (err); - } - } else { - dev = makedevice(dsp->ds_major, (minor_t)ppa + 1); - if ((err = dls_open_by_dev(dev, &dc)) != 0) { - if (qassociated) - (void) qassociate(dsp->ds_wq, -1); - return (err); - } - } - - /* - * Cache the MAC interface handle, a pointer to the immutable MAC - * information and the current and 'factory' MAC address. - */ - dsp->ds_mh = dls_mac(dc); - dsp->ds_mip = mac_info(dsp->ds_mh); - - mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr); + dev = makedevice(dsp->ds_major, (minor_t)ppa + 1); + if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0) + goto failed; - addr_length = dsp->ds_mip->mi_addr_length; - bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length); + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0) + goto failed; + entered_perim = B_TRUE; /* - * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for - * a non-VLAN interface). + * Open a channel. */ - dsp->ds_vid = dls_vid(dc); + if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0) + goto failed; + + if ((err = dls_open(dlp, ddp, dsp)) != 0) + goto failed; /* * Set the default packet priority. @@ -1169,12 +967,22 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) /* * Add a notify function so that the we get updates from the MAC. */ - dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp); - - dsp->ds_dc = dc; + dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp); dsp->ds_dlstate = DL_UNBOUND; - + mac_perim_exit(mph); return (0); + +failed: + if (dlp != NULL) + dls_link_rele(dlp); + if (entered_perim) + mac_perim_exit(mph); + if (ddp != NULL) + dls_devnet_rele(ddp); + if (qassociated) + (void) qassociate(dsp->ds_wq, -1); + + return (err); } /* @@ -1184,35 +992,56 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) void dld_str_detach(dld_str_t *dsp) { + mac_perim_handle_t mph; + int err; + + ASSERT(dsp->ds_datathr_cnt == 0); + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); /* * Remove the notify function. + * + * Note that we cannot wait for the notification callback to be removed + * since it could cause the deadlock with str_notify() since they both + * need the mac perimeter. Continue if we cannot remove the + * notification callback right now and wait after we leave the + * perimeter. */ - mac_notify_remove(dsp->ds_mh, dsp->ds_mnh); + err = mac_notify_remove(dsp->ds_mnh, B_FALSE); + dsp->ds_mnh = NULL; /* - * Disable the capabilities and clear the promisc flag. + * Disable the capabilities */ - ASSERT(!dsp->ds_polling); - ASSERT(!dsp->ds_soft_ring); dld_capabilities_disable(dsp); - dsp->ds_promisc = 0; - DLD_TX_QUIESCE(dsp); + /* + * Clear LSO flags. + */ + dsp->ds_lso = B_FALSE; + dsp->ds_lso_max = 0; + + dls_close(dsp); + mac_perim_exit(mph); /* - * Flush all pending packets which are sitting in the transmit queue. + * Now we leave the mac perimeter. If mac_notify_remove() failed + * because the notification callback was in progress, wait for + * it to finish before we proceed. */ - dld_tx_flush(dsp); + if (err != 0) + mac_notify_remove_wait(dsp->ds_mh); /* - * Clear LSO flags. + * An unreferenced tagged (non-persistent) vlan gets destroyed + * automatically in the call to dls_devnet_rele. 
*/ - dsp->ds_lso = B_FALSE; - dsp->ds_lso_max = 0; + dls_devnet_rele(dsp->ds_ddh); - dls_close(dsp->ds_dc); - dsp->ds_dc = NULL; + dsp->ds_sap = 0; dsp->ds_mh = NULL; + dsp->ds_mch = NULL; + dsp->ds_mip = NULL; if (dsp->ds_style == DL_STYLE2) (void) qassociate(dsp->ds_wq, -1); @@ -1221,7 +1050,6 @@ dld_str_detach(dld_str_t *dsp) * Re-initialize the DLPI state machine. */ dsp->ds_dlstate = DL_UNATTACHED; - } /* @@ -1314,7 +1142,8 @@ dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp, /* * Strip the VLAN tag for VLAN streams. */ - if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) { + if (is_ethernet && + mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) { newmp = i_dld_ether_header_strip_tag(mp); if (newmp == NULL) { freemsg(mp); @@ -1366,7 +1195,8 @@ dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp, * * Otherwise, strip the whole VLAN header. * - Untagged packets. Strip the whole MAC header. */ - if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) && + if (mhip->mhi_istagged && + (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) && ((dsp->ds_sap == ETHERTYPE_VLAN) || (dsp->ds_promisc & DLS_PROMISC_SAP))) { offset = VLAN_TAGSZ; @@ -1418,7 +1248,8 @@ dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp, /* * See MAC header stripping rules in the dld_str_rx_fastpath() function. */ - if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) && + if (mhip->mhi_istagged && + (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) && ((dsp->ds_sap == ETHERTYPE_VLAN) || (dsp->ds_promisc & DLS_PROMISC_SAP))) { offset = VLAN_TAGSZ; @@ -1534,7 +1365,7 @@ str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan) /* * Get the packet header information. */ - if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0) + if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0) return (NULL); /* @@ -1805,11 +1636,14 @@ str_notify_fastpath_flush(dld_str_t *dsp) /* * MAC notification callback. */ -static void +void str_notify(void *arg, mac_notify_type_t type) { dld_str_t *dsp = (dld_str_t *)arg; queue_t *q = dsp->ds_wq; + mac_handle_t mh = dsp->ds_mh; + mac_client_handle_t mch = dsp->ds_mch; + uint8_t addr[MAXMACADDRLEN]; switch (type) { case MAC_NOTE_TX: @@ -1820,26 +1654,23 @@ str_notify(void *arg, mac_notify_type_t type) /* * Send the appropriate DL_NOTIFY_IND. */ - if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC)) + if (mac_promisc_get(mh, MAC_DEVPROMISC)) str_notify_promisc_on_phys(dsp); else str_notify_promisc_off_phys(dsp); break; - case MAC_NOTE_PROMISC: - break; - case MAC_NOTE_UNICST: /* - * This notification is sent whenever the MAC unicast address - * changes. We need to re-cache the address. + * This notification is sent whenever the MAC unicast + * address changes. */ - mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr); + mac_unicast_primary_get(mh, addr); /* * Send the appropriate DL_NOTIFY_IND. */ - str_notify_phys_addr(dsp, dsp->ds_curr_addr); + str_notify_phys_addr(dsp, addr); break; case MAC_NOTE_LINK: @@ -1847,7 +1678,7 @@ str_notify(void *arg, mac_notify_type_t type) * This notification is sent every time the MAC driver * updates the link state. 
 */
- switch (mac_link_get(dsp->ds_mh)) {
+ switch (mac_client_stat_get(mch, MAC_STAT_LINK_STATE)) {
 case LINK_STATE_UP: {
 uint64_t speed;
@@ -1856,7 +1687,7 @@ str_notify(void *arg, mac_notify_type_t type)
 */
 str_notify_link_up(dsp);
- speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
+ speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
 str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
 break;
 }
@@ -1874,7 +1705,7 @@ str_notify(void *arg, mac_notify_type_t type)
 break;
 case MAC_NOTE_RESOURCE:
- case MAC_NOTE_VNIC:
+ case MAC_NOTE_CAPAB_CHG:
 /*
 * This notification is sent whenever the MAC resources
 * change or capabilities change. We need to renegotiate
@@ -1897,334 +1728,177 @@ str_notify(void *arg, mac_notify_type_t type)
 case MAC_NOTE_MARGIN:
 break;
+ case MAC_NOTE_PROMISC:
+ break;
+
 default:
 ASSERT(B_FALSE);
 break;
 }
}
-static inline uint_t
-mp_getsize(mblk_t *mp)
-{
- ASSERT(DB_TYPE(mp) == M_DATA);
- return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
-}
-
/*
- * Calculate the dld queue depth, free the messages that exceed the threshold.
+ * This function is called via a taskq mechanism to process all control
+ * messages on a per 'dsp' end point.
 */
static void
-dld_tx_qdepth_timer(void *arg)
+dld_wput_nondata_task(void *arg)
{
- dld_str_t *dsp = (dld_str_t *)arg;
- mblk_t *prev, *mp;
- uint_t cnt, msgcnt, size;
-
- mutex_enter(&dsp->ds_tx_list_lock);
-
- /* Calculate total size and count of the packet(s) */
- cnt = msgcnt = 0;
- for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
- prev = mp, mp = mp->b_next) {
- size = mp_getsize(mp);
- cnt += size;
- msgcnt++;
- if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
- ASSERT(dsp->ds_tx_qbusy);
- dsp->ds_tx_list_tail = prev;
- if (prev == NULL)
- dsp->ds_tx_list_head = NULL;
- else
- prev->b_next = NULL;
- freemsgchain(mp);
- cnt -= size;
- msgcnt--;
+ dld_str_t *dsp = arg;
+ mblk_t *mp;
+
+ mutex_enter(&dsp->ds_lock);
+ while (dsp->ds_pending_head != NULL) {
+ mp = dsp->ds_pending_head;
+ dsp->ds_pending_head = mp->b_next;
+ mp->b_next = NULL;
+ if (dsp->ds_pending_head == NULL)
+ dsp->ds_pending_tail = NULL;
+ mutex_exit(&dsp->ds_lock);
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ dld_proto(dsp, mp);
 break;
+ case M_IOCTL:
+ dld_ioc(dsp, mp);
+ break;
+ default:
+ ASSERT(0);
 }
+
+ mutex_enter(&dsp->ds_lock);
 }
- dsp->ds_tx_cnt = cnt;
- dsp->ds_tx_msgcnt = msgcnt;
- dsp->ds_tx_qdepth_tid = 0;
- mutex_exit(&dsp->ds_tx_list_lock);
+ ASSERT(dsp->ds_pending_tail == NULL);
+ dsp->ds_dlpi_pending = 0;
+ cv_broadcast(&dsp->ds_dlpi_pending_cv);
+ mutex_exit(&dsp->ds_lock);
}
/*
- * Enqueue one or more messages on the transmit queue. Caller specifies:
- * - the insertion position (head/tail).
- * - the message count and the total message size of messages to be queued
- * if they are known to the caller; or 0 if they are not known.
- *
- * If the caller does not know the message size information, this usually
- * means that dld_wsrv() managed to send some but not all of the queued
- * messages. For performance reasons, we do not calculate the queue depth
- * every time. Instead, a timer is started to calculate the queue depth
- * every 1 second (can be changed by tx_qdepth_interval).
+ * Kernel thread to handle taskq dispatch failures in dld_wput_nondata. This
+ * thread is started at boot time. 
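 *
 * (Annotation: the producer side of this hand-off is dld_wput_nondata()
 * below. When its taskq_dispatch(..., TQ_NOSLEEP) fails, the dsp is
 * queued on dld_taskq_list and dld_taskq_cv is signalled; this thread
 * then re-dispatches with TQ_SLEEP, which blocks instead of failing.)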
*/ static void -dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert, - uint_t msgcnt, uint_t cnt) +dld_taskq_dispatch(void) { - queue_t *q = dsp->ds_wq; - uint_t tot_cnt, tot_msgcnt; - mblk_t *next; - - mutex_enter(&dsp->ds_tx_list_lock); - - /* - * Simply enqueue the message and calculate the queue depth via - * timer if: - * - * - the current queue depth is incorrect, and the timer is already - * started; or - * - * - the given message size is unknown and it is allowed to start the - * timer; - */ - if ((dsp->ds_tx_qdepth_tid != 0) || - (msgcnt == 0 && tx_qdepth_interval != 0)) { - goto enqueue; - } + callb_cpr_t cprinfo; + dld_str_t *dsp; - /* - * The timer is not allowed, so calculate the message size now. - */ - if (msgcnt == 0) { - for (next = mp; next != NULL; next = next->b_next) { - cnt += mp_getsize(next); - msgcnt++; + CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr, + "dld_taskq_dispatch"); + mutex_enter(&dld_taskq_lock); + + while (!dld_taskq_quit) { + dsp = list_head(&dld_taskq_list); + while (dsp != NULL) { + list_remove(&dld_taskq_list, dsp); + mutex_exit(&dld_taskq_lock); + VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task, + dsp, TQ_SLEEP) != 0); + mutex_enter(&dld_taskq_lock); + dsp = list_head(&dld_taskq_list); } - } - - /* - * Grow the queue depth using the input messesge size. - * - * If the queue depth would exceed the allowed threshold, drop - * new packet(s) and drain those already in the queue. - */ - tot_cnt = dsp->ds_tx_cnt + cnt; - tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt; - - if (!head_insert && (tot_cnt >= dld_max_q_count || - tot_msgcnt >= dld_max_q_count)) { - ASSERT(dsp->ds_tx_qbusy); - mutex_exit(&dsp->ds_tx_list_lock); - freemsgchain(mp); - goto done; - } - /* Update the queue size parameters */ - dsp->ds_tx_cnt = tot_cnt; - dsp->ds_tx_msgcnt = tot_msgcnt; - -enqueue: - /* - * If the transmit queue is currently empty and we are - * about to deposit the packet(s) there, switch mode to - * "busy" and raise flow-control condition. - */ - if (!dsp->ds_tx_qbusy) { - dsp->ds_tx_qbusy = B_TRUE; - ASSERT(dsp->ds_tx_flow_mp != NULL); - (void) putq(q, dsp->ds_tx_flow_mp); - dsp->ds_tx_flow_mp = NULL; - } - - if (!head_insert) { - /* Tail insertion */ - if (dsp->ds_tx_list_head == NULL) - dsp->ds_tx_list_head = mp; - else - dsp->ds_tx_list_tail->b_next = mp; - dsp->ds_tx_list_tail = tail; - } else { - /* Head insertion */ - tail->b_next = dsp->ds_tx_list_head; - if (dsp->ds_tx_list_head == NULL) - dsp->ds_tx_list_tail = tail; - dsp->ds_tx_list_head = mp; - } - - if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 && - tx_qdepth_interval != 0) { - /* - * The message size is not given so that we need to start - * the timer to calculate the queue depth. 
- */ - dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp, - drv_usectohz(tx_qdepth_interval)); - ASSERT(dsp->ds_tx_qdepth_tid != NULL); - } - mutex_exit(&dsp->ds_tx_list_lock); -done: - /* Schedule service thread to drain the transmit queue */ - if (!head_insert) - qenable(q); -} -void -dld_tx_flush(dld_str_t *dsp) -{ - timeout_id_t tid = 0; - - mutex_enter(&dsp->ds_tx_list_lock); - if (dsp->ds_tx_list_head != NULL) { - freemsgchain(dsp->ds_tx_list_head); - dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL; - dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0; - if (dsp->ds_tx_qbusy) { - dsp->ds_tx_flow_mp = getq(dsp->ds_wq); - ASSERT(dsp->ds_tx_flow_mp != NULL); - dsp->ds_tx_qbusy = B_FALSE; - } - if ((tid = dsp->ds_tx_qdepth_tid) != 0) - dsp->ds_tx_qdepth_tid = 0; + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&dld_taskq_cv, &dld_taskq_lock); + CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock); } - mutex_exit(&dsp->ds_tx_list_lock); - /* - * Note that ds_tx_list_lock (which is acquired by the timeout - * callback routine) cannot be held across the call to untimeout(). - */ - if (tid != 0) - (void) untimeout(tid); + dld_taskq_done = B_TRUE; + cv_signal(&dld_taskq_cv); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); } /* - * Process a non-data message. + * All control operations are serialized on the 'dsp' and are also funneled + * through a taskq mechanism to ensure that subsequent processing has kernel + * context and can safely use cv_wait. + * + * Mechanisms to handle taskq dispatch failures + * + * The only way to be sure that taskq dispatch does not fail is to either + * specify TQ_SLEEP or to use a static taskq and prepopulate it with + * some number of entries and make sure that the number of outstanding requests + * are less than that number. We can't use TQ_SLEEP since we don't know the + * context. Nor can we bound the total number of 'dsp' end points. So we are + * unable to use either of the above schemes, and are forced to deal with + * taskq dispatch failures. Note that even dynamic taskq could fail in + * dispatch if TQ_NOSLEEP is specified, since this flag is translated + * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq + * framework. + * + * We maintain a queue of 'dsp's that encountered taskq dispatch failure. + * We also have a single global thread to retry the taskq dispatch. This + * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but + * uses TQ_SLEEP to ensure eventual success of the dispatch operation. */ static void dld_wput_nondata(dld_str_t *dsp, mblk_t *mp) { - ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) || - (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL)); - - mutex_enter(&dsp->ds_disp_lock); - - /* - * The processing of the message might block. Enqueue the - * message for later processing. - */ - if (dsp->ds_pending_head == NULL) { - dsp->ds_pending_head = dsp->ds_pending_tail = mp; - } else { + ASSERT(mp->b_next == NULL); + mutex_enter(&dsp->ds_lock); + if (dsp->ds_pending_head != NULL) { + ASSERT(dsp->ds_dlpi_pending); dsp->ds_pending_tail->b_next = mp; dsp->ds_pending_tail = mp; + mutex_exit(&dsp->ds_lock); + return; } - + ASSERT(dsp->ds_pending_tail == NULL); + dsp->ds_pending_head = dsp->ds_pending_tail = mp; /* - * If there is no task pending, kick off the task. + * At this point if ds_dlpi_pending is set, it implies that the taskq + * thread is still active and is processing the last message, though + * the pending queue has been emptied. 
*/ - if (dsp->ds_tid == NULL) { - dsp->ds_tid = taskq_dispatch(dld_disp_taskq, - dld_wput_nondata_task, dsp, TQ_SLEEP); - ASSERT(dsp->ds_tid != NULL); + if (dsp->ds_dlpi_pending) { + mutex_exit(&dsp->ds_lock); + return; } - mutex_exit(&dsp->ds_disp_lock); + + dsp->ds_dlpi_pending = 1; + mutex_exit(&dsp->ds_lock); + + if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp, + TQ_NOSLEEP) != 0) + return; + + mutex_enter(&dld_taskq_lock); + list_insert_tail(&dld_taskq_list, dsp); + cv_signal(&dld_taskq_cv); + mutex_exit(&dld_taskq_lock); } /* - * The worker thread which processes non-data messages. Note we only process - * one message at one time in order to be able to "flush" the queued message - * and serialize the processing. + * Process an M_IOCTL message. */ static void -dld_wput_nondata_task(void *arg) +dld_ioc(dld_str_t *dsp, mblk_t *mp) { - dld_str_t *dsp = (dld_str_t *)arg; - mblk_t *mp; - - mutex_enter(&dsp->ds_disp_lock); - ASSERT(dsp->ds_pending_head != NULL); - ASSERT(dsp->ds_tid != NULL); - - if (dsp->ds_closing) - goto closing; - - mp = dsp->ds_pending_head; - if ((dsp->ds_pending_head = mp->b_next) == NULL) - dsp->ds_pending_tail = NULL; - mp->b_next = NULL; + uint_t cmd; - mutex_exit(&dsp->ds_disp_lock); + cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; + ASSERT(dsp->ds_type == DLD_DLPI); - switch (DB_TYPE(mp)) { - case M_PROTO: - case M_PCPROTO: - ASSERT(dsp->ds_type == DLD_DLPI); - dld_wput_proto_nondata(dsp, mp); + switch (cmd) { + case DLIOCNATIVE: + ioc_native(dsp, mp); break; - case M_IOCTL: { - uint_t cmd; - - if (dsp->ds_type == DLD_CONTROL) { - ASSERT(dsp->ds_ioctl != NULL); - dsp->ds_ioctl(dsp->ds_wq, mp); - break; - } - - cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; - - switch (cmd) { - case DLIOCNATIVE: - ioc_native(dsp, mp); - break; - case DLIOCMARGININFO: - ioc_margin(dsp, mp); - break; - case DLIOCRAW: - ioc_raw(dsp, mp); - break; - case DLIOCHDRINFO: - ioc_fast(dsp, mp); - break; - default: - ioc(dsp, mp); - break; - } + case DLIOCMARGININFO: + ioc_margin(dsp, mp); break; - } - case M_IOCDATA: - ASSERT(dsp->ds_type == DLD_DLPI); - ioc(dsp, mp); + case DLIOCRAW: + ioc_raw(dsp, mp); break; + case DLIOCHDRINFO: + ioc_fast(dsp, mp); + break; + default: + ioc(dsp, mp); } - - mutex_enter(&dsp->ds_disp_lock); - - if (dsp->ds_closing) - goto closing; - - if (dsp->ds_pending_head != NULL) { - dsp->ds_tid = taskq_dispatch(dld_disp_taskq, - dld_wput_nondata_task, dsp, TQ_SLEEP); - ASSERT(dsp->ds_tid != NULL); - } else { - dsp->ds_tid = NULL; - } - mutex_exit(&dsp->ds_disp_lock); - return; - - /* - * If the stream is closing, flush all queued messages and inform - * the stream once it is done. - */ -closing: - freemsgchain(dsp->ds_pending_head); - dsp->ds_pending_head = dsp->ds_pending_tail = NULL; - dsp->ds_tid = NULL; - cv_signal(&dsp->ds_disp_cv); - mutex_exit(&dsp->ds_disp_lock); -} - -/* - * Flush queued non-data messages. - */ -static void -dld_flush_nondata(dld_str_t *dsp) -{ - mutex_enter(&dsp->ds_disp_lock); - freemsgchain(dsp->ds_pending_head); - dsp->ds_pending_head = dsp->ds_pending_tail = NULL; - mutex_exit(&dsp->ds_disp_lock); } /* @@ -2236,8 +1910,6 @@ ioc_native(dld_str_t *dsp, mblk_t *mp) queue_t *q = dsp->ds_wq; const mac_info_t *mip = dsp->ds_mip; - rw_enter(&dsp->ds_lock, RW_WRITER); - /* * Native mode can be enabled if it's disabled and if the * native media type is different. 
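 *
 * (Hypothetical usage, not from this change: a consumer along the lines
 * of libdlpi would issue this as a plain ioctl,
 *
 *	int media = ioctl(fd, DLIOCNATIVE, 0);
 *
 * where a positive return value is the native media type supplied by
 * the miocack() below, and a failure with EINVAL means native mode is
 * unavailable.)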
@@ -2245,8 +1917,6 @@ ioc_native(dld_str_t *dsp, mblk_t *mp) if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia) dsp->ds_native = B_TRUE; - rw_exit(&dsp->ds_lock); - if (dsp->ds_native) miocack(q, mp, 0, mip->mi_nativemedia); else @@ -2286,22 +1956,34 @@ static void ioc_raw(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; + + if (dsp->ds_mh == NULL) { + dsp->ds_mode = DLD_RAW; + miocack(q, mp, 0, 0); + return; + } - if (dsp->ds_polling || dsp->ds_soft_ring) { + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_polling || dsp->ds_direct) { + mac_perim_exit(mph); miocnak(q, mp, 0, EPROTO); return; } - rw_enter(&dsp->ds_lock, RW_WRITER); - if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) { + if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) { /* * Set the receive callback. */ - dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp); - dsp->ds_tx = str_mdata_raw_put; + dls_rx_set(dsp, dld_str_rx_raw, dsp); } + + /* + * Note that raw mode is enabled. + */ dsp->ds_mode = DLD_RAW; - rw_exit(&dsp->ds_lock); + mac_perim_exit(mph); + miocack(q, mp, 0, 0); } @@ -2321,6 +2003,7 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) uint_t addr_length; queue_t *q = dsp->ds_wq; int err; + mac_perim_handle_t mph; if (dld_opt & DLD_OPT_NO_FASTPATH) { err = ENOTSUP; @@ -2352,11 +2035,6 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) goto failed; } - /* - * We don't need to hold any locks to access ds_dlstate, because - * control message prossessing (which updates this field) is - * serialized. - */ if (dsp->ds_dlstate != DL_IDLE) { err = ENOTSUP; goto failed; @@ -2371,24 +2049,31 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) addr = nmp->b_rptr + off; sap = *(uint16_t *)(nmp->b_rptr + off + addr_length); - if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) { + if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) { err = ENOMEM; goto failed; } - rw_enter(&dsp->ds_lock, RW_WRITER); - ASSERT(dsp->ds_dlstate == DL_IDLE); + /* + * This ioctl might happen concurrently with a direct call to dld_capab + * that tries to enable direct and/or poll capabilities. Since the + * stack does not serialize them, we do so here to avoid mixing + * the callbacks. + */ + mac_perim_enter_by_mh(dsp->ds_mh, &mph); if (dsp->ds_mode != DLD_FASTPATH) { /* - * Set the receive callback (unless polling or - * soft-ring is enabled). + * Set the receive callback (unless polling is enabled). + */ + if (!dsp->ds_polling && !dsp->ds_direct) + dls_rx_set(dsp, dld_str_rx_fastpath, dsp); + + /* + * Note that fast-path mode is enabled. */ dsp->ds_mode = DLD_FASTPATH; - if (!dsp->ds_polling && !dsp->ds_soft_ring) - dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp); - dsp->ds_tx = str_mdata_fastpath_put; } - rw_exit(&dsp->ds_lock); + mac_perim_exit(mph); freemsg(nmp->b_cont); nmp->b_cont = hmp; @@ -2399,17 +2084,17 @@ failed: miocnak(q, mp, 0, err); } +/* + * Catch-all handler. + */ static void ioc(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; - mac_handle_t mh; if (dsp->ds_dlstate == DL_UNATTACHED) { miocnak(q, mp, 0, EINVAL); return; } - mh = dsp->ds_mh; - ASSERT(mh != NULL); - mac_ioctl(mh, q, mp); + mac_ioctl(dsp->ds_mh, q, mp); } diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 2002e994bf..064217c8f2 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -23,583 +23,285 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> -#include <sys/stream.h> #include <sys/strsun.h> -#include <sys/sysmacros.h> -#include <sys/atomic.h> -#include <sys/stat.h> -#include <sys/dlpi.h> #include <sys/vlan.h> -#include <sys/ethernet.h> -#include <sys/byteorder.h> -#include <sys/mac.h> - -#include <sys/dls.h> -#include <sys/dls_impl.h> -#include <sys/dls_soft_ring.h> - -static kmem_cache_t *i_dls_impl_cachep; -static uint32_t i_dls_impl_count; - -static kstat_t *dls_ksp = (kstat_t *)NULL; -struct dls_kstats dls_kstat = -{ - { "soft_ring_pkt_drop", KSTAT_DATA_UINT32 }, -}; - -static int dls_open(dls_vlan_t *, dls_dl_handle_t ddh, dls_channel_t *); - -/* - * Private functions. - */ - -/*ARGSUSED*/ -static int -i_dls_constructor(void *buf, void *arg, int kmflag) -{ - dls_impl_t *dip = buf; - - bzero(buf, sizeof (dls_impl_t)); - - rw_init(&(dip->di_lock), NULL, RW_DRIVER, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -i_dls_destructor(void *buf, void *arg) -{ - dls_impl_t *dip = buf; - - ASSERT(dip->di_dvp == NULL); - ASSERT(dip->di_mnh == NULL); - ASSERT(dip->di_dmap == NULL); - ASSERT(!dip->di_local); - ASSERT(!dip->di_bound); - ASSERT(dip->di_rx == NULL); - ASSERT(dip->di_txinfo == NULL); - - rw_destroy(&(dip->di_lock)); -} - -static void -i_dls_notify(void *arg, mac_notify_type_t type) -{ - dls_impl_t *dip = arg; - - switch (type) { - case MAC_NOTE_UNICST: - mac_unicst_get(dip->di_mh, dip->di_unicst_addr); - break; - - case MAC_NOTE_PROMISC: - case MAC_NOTE_VNIC: - /* - * Every time the MAC interface changes promiscuity or - * the VNIC characteristics change we need to reset - * our transmit information. - */ - dip->di_txinfo = mac_tx_get(dip->di_mh); - break; - } -} - -static void -dls_stat_init() -{ - if ((dls_ksp = kstat_create("dls", 0, "dls_stat", - "net", KSTAT_TYPE_NAMED, - sizeof (dls_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) == NULL) { - cmn_err(CE_WARN, - "DLS: failed to create kstat structure for dls stats"); - return; - } - dls_ksp->ks_data = (void *)&dls_kstat; - kstat_install(dls_ksp); -} - -static void -dls_stat_destroy() -{ - kstat_delete(dls_ksp); -} - -/* - * Module initialization functions. - */ - -void -dls_init(void) -{ - /* - * Create a kmem_cache of dls_impl_t. - */ - i_dls_impl_cachep = kmem_cache_create("dls_cache", - sizeof (dls_impl_t), 0, i_dls_constructor, i_dls_destructor, NULL, - NULL, NULL, 0); - ASSERT(i_dls_impl_cachep != NULL); - soft_ring_init(); - dls_stat_init(); -} +#include <sys/dld_impl.h> int -dls_fini(void) +dls_open(dls_link_t *dlp, dls_dl_handle_t ddh, dld_str_t *dsp) { - /* - * If there are any dls_impl_t in use then return EBUSY. - */ - if (i_dls_impl_count != 0) - return (EBUSY); - - /* - * Destroy the kmem_cache. - */ - kmem_cache_destroy(i_dls_impl_cachep); - dls_stat_destroy(); - return (0); -} - -/* - * Client functions. - */ - -/* - * /dev node style-2 VLAN PPA access. This might result in a newly created - * dls_vlan_t. Note that this dls_vlan_t is different from others, in that - * this VLAN might not have a link name that is managed by dlmgmtd (we cannot - * use its VLAN ppa hack name as it might conflict with a vanity name). - */ -int -dls_open_style2_vlan(major_t major, uint_t ppa, dls_channel_t *dcp) -{ - dev_t dev = makedevice(major, DLS_PPA2INST(ppa) + 1); - uint_t vid = DLS_PPA2VID(ppa); - dls_vlan_t *lndvp, *dvp; - int err; - - /* - * First find the dls_vlan_t this VLAN is created on. This must be - * a GLDv3 driver based device. 
- */ - if ((err = dls_vlan_hold_by_dev(dev, &lndvp)) != 0) - return (err); - - if (vid > VLAN_ID_MAX) - return (ENOENT); - - err = dls_vlan_hold(lndvp->dv_dlp->dl_name, vid, &dvp, B_FALSE, B_TRUE); - if (err != 0) - goto done; - - if ((err = dls_open(dvp, NULL, dcp)) != 0) - dls_vlan_rele(dvp); - -done: - dls_vlan_rele(lndvp); - return (err); -} - -int -dls_open_by_dev(dev_t dev, dls_channel_t *dcp) -{ - dls_dl_handle_t ddh; - dls_vlan_t *dvp; - int err; - - /* - * Get a reference to the given dls_vlan_t. - */ - if ((err = dls_devnet_open_by_dev(dev, &dvp, &ddh)) != 0) - return (err); - - if ((err = dls_open(dvp, ddh, dcp)) != 0) { - if (ddh != NULL) - dls_devnet_close(ddh); - else - dls_vlan_rele(dvp); - } - - return (err); -} - -static int -dls_open(dls_vlan_t *dvp, dls_dl_handle_t ddh, dls_channel_t *dcp) -{ - dls_impl_t *dip; - dls_link_t *dlp; - int err; zoneid_t zid = getzoneid(); boolean_t local; /* - * Check whether this client belongs to the zone of this dvp. Note that - * a global zone client is allowed to open a local zone dvp. + * Check whether this client belongs to the zone of this dlp. Note that + * a global zone client is allowed to open a local zone dlp. */ - mutex_enter(&dvp->dv_lock); - if (zid != GLOBAL_ZONEID && dvp->dv_zid != zid) { - mutex_exit(&dvp->dv_lock); + if (zid != GLOBAL_ZONEID && dlp->dl_zid != zid) return (ENOENT); - } - local = (zid == dvp->dv_zid); - dvp->dv_zone_ref += (local ? 1 : 0); - mutex_exit(&dvp->dv_lock); - - dlp = dvp->dv_dlp; - if ((err = mac_start(dlp->dl_mh)) != 0) { - mutex_enter(&dvp->dv_lock); - dvp->dv_zone_ref -= (local ? 1 : 0); - mutex_exit(&dvp->dv_lock); - return (err); - } - /* - * Allocate a new dls_impl_t. - */ - dip = kmem_cache_alloc(i_dls_impl_cachep, KM_SLEEP); - dip->di_dvp = dvp; - dip->di_ddh = ddh; + local = (zid == dlp->dl_zid); + dlp->dl_zone_ref += (local ? 1 : 0); /* * Cache a copy of the MAC interface handle, a pointer to the - * immutable MAC info and a copy of the current MAC address. + * immutable MAC info. */ - dip->di_mh = dlp->dl_mh; - dip->di_mip = dlp->dl_mip; + dsp->ds_dlp = dlp; + dsp->ds_mh = dlp->dl_mh; + dsp->ds_mch = dlp->dl_mch; + dsp->ds_mip = dlp->dl_mip; + dsp->ds_ddh = ddh; + dsp->ds_local = local; - mac_unicst_get(dip->di_mh, dip->di_unicst_addr); - - /* - * Set the MAC transmit information. - */ - dip->di_txinfo = mac_tx_get(dip->di_mh); - - /* - * Add a notification function so that we get updates from - * the MAC. - */ - dip->di_mnh = mac_notify_add(dip->di_mh, i_dls_notify, - (void *)dip); - - /* - * Bump the kmem_cache count to make sure it is not prematurely - * destroyed. - */ - atomic_add_32(&i_dls_impl_count, 1); - - dip->di_local = local; - - /* - * Hand back a reference to the dls_impl_t. - */ - *dcp = (dls_channel_t)dip; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); return (0); } void -dls_close(dls_channel_t dc) +dls_close(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_vlan_t *dvp = dip->di_dvp; - dls_link_t *dlp = dvp->dv_dlp; + dls_link_t *dlp = dsp->ds_dlp; dls_multicst_addr_t *p; dls_multicst_addr_t *nextp; - dls_dl_handle_t ddh = dip->di_ddh; + uint32_t old_flags; - if (dip->di_local) { - mutex_enter(&dvp->dv_lock); - dvp->dv_zone_ref--; - mutex_exit(&dvp->dv_lock); - } - dip->di_local = B_FALSE; + ASSERT(dsp->ds_datathr_cnt == 0); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - dls_active_clear(dc); + if (dsp->ds_local) + dlp->dl_zone_ref--; + dsp->ds_local = B_FALSE; - rw_enter(&(dip->di_lock), RW_WRITER); /* - * Remove the notify function. 
+ * Walk the list of multicast addresses, disabling each at the MAC. + * Note that we must remove multicast address before + * mac_unicast_remove() (called by dls_active_clear()) because + * mac_multicast_remove() relies on the unicast flows on the mac + * client. */ - mac_notify_remove(dip->di_mh, dip->di_mnh); - dip->di_mnh = NULL; - - /* - * If the dls_impl_t is bound then unbind it. - */ - if (dip->di_bound) { - rw_exit(&(dip->di_lock)); - dls_link_remove(dlp, dip); - rw_enter(&(dip->di_lock), RW_WRITER); - dip->di_bound = B_FALSE; - } - - /* - * Walk the list of multicast addresses, disabling each at - * the MAC. - */ - for (p = dip->di_dmap; p != NULL; p = nextp) { - (void) mac_multicst_remove(dip->di_mh, p->dma_addr); + for (p = dsp->ds_dmap; p != NULL; p = nextp) { + (void) mac_multicast_remove(dsp->ds_mch, p->dma_addr); nextp = p->dma_nextp; kmem_free(p, sizeof (dls_multicst_addr_t)); } - dip->di_dmap = NULL; + dsp->ds_dmap = NULL; - dip->di_rx = NULL; - dip->di_rx_arg = NULL; - rw_exit(&(dip->di_lock)); + dls_active_clear(dsp); /* - * If the MAC has been set in promiscuous mode then disable it. + * If the dld_str_t is bound then unbind it. */ - (void) dls_promisc(dc, 0); - dip->di_txinfo = NULL; + if (dsp->ds_dlstate == DL_IDLE) { + (void) dls_unbind(dsp); + dsp->ds_dlstate = DL_UNBOUND; + } /* - * Free the dls_impl_t back to the cache. + * If the MAC has been set in promiscuous mode then disable it. + * This needs to be done before resetting ds_rx. */ - dip->di_txinfo = NULL; - - if (dip->di_soft_ring_list != NULL) { - soft_ring_set_destroy(dip->di_soft_ring_list, - dip->di_soft_ring_size); - dip->di_soft_ring_list = NULL; - } - dip->di_soft_ring_size = 0; + old_flags = dsp->ds_promisc; + dsp->ds_promisc = 0; + (void) dls_promisc(dsp, old_flags); /* - * Decrement the reference count to allow the cache to be destroyed - * if there are no more dls_impl_t. + * At this point we have cutoff inbound packet flow from the mac + * for this 'dsp'. The dls_link_remove above cut off packets meant + * for us and waited for upcalls to finish. Similarly the dls_promisc + * reset above waited for promisc callbacks to finish. Now we can + * safely reset ds_rx to NULL */ - atomic_add_32(&i_dls_impl_count, -1); - - dip->di_dvp = NULL; + dsp->ds_rx = NULL; + dsp->ds_rx_arg = NULL; - kmem_cache_free(i_dls_impl_cachep, dip); - - mac_stop(dvp->dv_dlp->dl_mh); + dsp->ds_dlp = NULL; /* - * Release our reference to the dls_vlan_t allowing that to be - * destroyed if there are no more dls_impl_t. An unreferenced tagged - * (non-persistent) vlan gets destroyed automatically. + * Release our reference to the dls_link_t allowing that to be + * destroyed if there are no more dls_impl_t. */ - if (ddh != NULL) - dls_devnet_close(ddh); - else - dls_vlan_rele(dvp); -} - -mac_handle_t -dls_mac(dls_channel_t dc) -{ - return (((dls_impl_t *)dc)->di_mh); -} - -uint16_t -dls_vid(dls_channel_t dc) -{ - return (((dls_impl_t *)dc)->di_dvp->dv_id); + dls_link_rele(dlp); } int -dls_bind(dls_channel_t dc, uint32_t sap) +dls_bind(dld_str_t *dsp, uint32_t sap) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp; uint32_t dls_sap; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + /* * Check to see the value is legal for the media type. */ - if (!mac_sap_verify(dip->di_mh, sap, &dls_sap)) + if (!mac_sap_verify(dsp->ds_mh, sap, &dls_sap)) return (EINVAL); - if (dip->di_promisc & DLS_PROMISC_SAP) + + if (dsp->ds_promisc & DLS_PROMISC_SAP) dls_sap = DLS_SAP_PROMISC; /* - * Set up the dls_impl_t to mark it as able to receive packets. 
+ * Set up the dld_str_t to mark it as able to receive packets. */ - rw_enter(&(dip->di_lock), RW_WRITER); - ASSERT(!dip->di_bound); - dip->di_sap = sap; - dip->di_bound = B_TRUE; - rw_exit(&(dip->di_lock)); + dsp->ds_sap = sap; /* - * Now bind the dls_impl_t by adding it into the hash table in the - * dls_link_t. + * The MAC layer does the VLAN demultiplexing and will only pass up + * untagged packets to non-promiscuous primary MAC clients. In order to + * support the binding to the VLAN SAP which is required by DLPI, dls + * needs to get a copy of all tagged packets when the client binds to + * the VLAN SAP. We do this by registering a separate promiscuous + * callback for each dls client binding to that SAP. * - * NOTE: This must be done without the dls_impl_t lock being held - * otherwise deadlock may ensue. - */ - dlp = dip->di_dvp->dv_dlp; - dls_link_add(dlp, dls_sap, dip); + * Note: even though there are two promiscuous handles in dld_str_t, + * ds_mph is for the regular promiscuous mode, ds_vlan_mph is the handle + * to receive VLAN pkt when promiscuous mode is not on. Only one of + * them can be non-NULL at the same time, to avoid receiving dup copies + * of pkts. + */ + if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) { + int err; + + if (dsp->ds_vlan_mph != NULL) + return (EINVAL); + err = mac_promisc_add(dsp->ds_mch, + MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, + &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); + return (err); + } + /* + * Now bind the dld_str_t by adding it into the hash table in the + * dls_link_t. + */ + dls_link_add(dsp->ds_dlp, dls_sap, dsp); return (0); } -void -dls_unbind(dls_channel_t dc) +int +dls_unbind(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); /* - * Unbind the dls_impl_t by removing it from the hash table in the - * dls_link_t. - * - * NOTE: This must be done without the dls_impl_t lock being held - * otherise deadlock may enuse. + * For VLAN SAP, there was a promisc handle registered when dls_bind. + * When unbind this dls link, we need to remove the promisc handle. + * See comments in dls_bind(). */ - dlp = dip->di_dvp->dv_dlp; - dls_link_remove(dlp, dip); + if (dsp->ds_vlan_mph != NULL) { + int err; + + err = mac_promisc_remove(dsp->ds_vlan_mph); + ASSERT(err == 0); + dsp->ds_vlan_mph = NULL; + return (err); + } /* - * Mark the dls_impl_t as unable to receive packets This will make - * sure that 'receives in flight' will not come our way. + * Unbind the dld_str_t by removing it from the hash table in the + * dls_link_t. */ - dip->di_bound = B_FALSE; + dls_link_remove(dsp->ds_dlp, dsp); + dsp->ds_sap = 0; + return (0); } int -dls_promisc(dls_channel_t dc, uint32_t flags) +dls_promisc(dld_str_t *dsp, uint32_t old_flags) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp; int err = 0; - ASSERT(!(flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + ASSERT(!(dsp->ds_promisc & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | DLS_PROMISC_PHYS))); - /* - * Check if we need to turn on 'all sap' mode. - */ - rw_enter(&(dip->di_lock), RW_WRITER); - dlp = dip->di_dvp->dv_dlp; - if ((flags & DLS_PROMISC_SAP) && - !(dip->di_promisc & DLS_PROMISC_SAP)) { - dip->di_promisc |= DLS_PROMISC_SAP; - if (!dip->di_bound) - goto multi; - - rw_exit(&(dip->di_lock)); - dls_link_remove(dlp, dip); - dls_link_add(dlp, DLS_SAP_PROMISC, dip); - rw_enter(&(dip->di_lock), RW_WRITER); - goto multi; - } - - /* - * Check if we need to turn off 'all sap' mode. 
- */ - if (!(flags & DLS_PROMISC_SAP) && - (dip->di_promisc & DLS_PROMISC_SAP)) { - uint32_t dls_sap; - - dip->di_promisc &= ~DLS_PROMISC_SAP; - if (!dip->di_bound) - goto multi; - - rw_exit(&(dip->di_lock)); - dls_link_remove(dlp, dip); - (void) mac_sap_verify(dip->di_mh, dip->di_sap, &dls_sap); - dls_link_add(dlp, dls_sap, dip); - rw_enter(&(dip->di_lock), RW_WRITER); - } - -multi: - /* - * It's easiest to add the txloop callback up-front; if promiscuous - * mode cannot be enabled, then we'll remove it before returning. - * Use dl_promisc_lock to prevent racing with another thread also - * manipulating the promiscuous state on another dls_impl_t associated - * with the same dls_link_t. - */ - mutex_enter(&dlp->dl_promisc_lock); - if ((dlp->dl_npromisc == 0) && (flags & DLS_PROMISC_PHYS)) { - ASSERT(dlp->dl_mth == NULL); - dlp->dl_mth = mac_txloop_add(dlp->dl_mh, dls_link_txloop, dlp); - } - - /* - * Turn on or off 'all multicast' mode, if necessary. - */ - if (flags & DLS_PROMISC_MULTI) { - if (!(dip->di_promisc & DLS_PROMISC_MULTI)) { - if ((err = mac_promisc_set(dip->di_mh, B_TRUE, - MAC_DEVPROMISC)) != 0) { - goto done; - } - dip->di_promisc |= DLS_PROMISC_MULTI; - } - } else { - if (dip->di_promisc & DLS_PROMISC_MULTI) { - if ((err = mac_promisc_set(dip->di_mh, B_FALSE, - MAC_DEVPROMISC)) != 0) { - goto done; - } - dip->di_promisc &= ~DLS_PROMISC_MULTI; - } - } - - /* - * Turn on or off 'all physical' mode, if necessary. - */ - if (flags & DLS_PROMISC_PHYS) { - if (!(dip->di_promisc & DLS_PROMISC_PHYS)) { - err = mac_promisc_set(dip->di_mh, B_TRUE, MAC_PROMISC); - if (err != 0) - goto done; - - dip->di_promisc |= DLS_PROMISC_PHYS; - dlp->dl_npromisc++; + if (old_flags == 0 && dsp->ds_promisc != 0) { + /* + * If only DLS_PROMISC_SAP, we don't turn on the + * physical promisc mode + */ + err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL, + dls_rx_promisc, dsp, &dsp->ds_mph, + (dsp->ds_promisc != DLS_PROMISC_SAP) ? 0 : + MAC_PROMISC_FLAGS_NO_PHYS); + if (err != 0) + return (err); + + /* Remove vlan promisc handle to avoid sending dup copy up */ + if (dsp->ds_vlan_mph != NULL) { + err = mac_promisc_remove(dsp->ds_vlan_mph); + dsp->ds_vlan_mph = NULL; } - } else { - if (dip->di_promisc & DLS_PROMISC_PHYS) { - err = mac_promisc_set(dip->di_mh, B_FALSE, MAC_PROMISC); - if (err != 0) - goto done; - - dip->di_promisc &= ~DLS_PROMISC_PHYS; - dlp->dl_npromisc--; + } else if (old_flags != 0 && dsp->ds_promisc == 0) { + ASSERT(dsp->ds_mph != NULL); + err = mac_promisc_remove(dsp->ds_mph); + /* + * The failure only relates to resetting the device promiscuity + * The mac layer does not fail in freeing up the promiscuous + * data structures, and so we clear the ds_mph. The dld stream + * may be closing and we can't fail that. + */ + dsp->ds_mph = NULL; + if (err != 0) + return (err); + + if (dsp->ds_sap == ETHERTYPE_VLAN && + dsp->ds_dlstate != DL_UNBOUND) { + int err; + + if (dsp->ds_vlan_mph != NULL) + return (EINVAL); + err = mac_promisc_add(dsp->ds_mch, + MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, + &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); + return (err); } + } else if (old_flags == DLS_PROMISC_SAP && dsp->ds_promisc != 0 && + dsp->ds_promisc != old_flags) { + /* + * If the old flag is PROMISC_SAP, but the current flag has + * changed to some new non-zero value, we need to turn the + * physical promiscuous mode. 
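+		 * To summarize the three transitions this function handles
+		 * (derived from the branches above and below):
+		 *
+		 *	0       -> nonzero   mac_promisc_add(), passing
+		 *	                     MAC_PROMISC_FLAGS_NO_PHYS when
+		 *	                     only DLS_PROMISC_SAP is set
+		 *	nonzero -> 0         mac_promisc_remove(), re-adding
+		 *	                     the VLAN promisc handle if still
+		 *	                     bound to ETHERTYPE_VLAN
+		 *	SAP     -> other     remove, then re-add without
+		 *	                     MAC_PROMISC_FLAGS_NO_PHYS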
+ */ + ASSERT(dsp->ds_mph != NULL); + err = mac_promisc_remove(dsp->ds_mph); + if (err != 0) + return (err); + err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL, + dls_rx_promisc, dsp, &dsp->ds_mph, 0); } -done: - if (dlp->dl_npromisc == 0 && dlp->dl_mth != NULL) { - mac_txloop_remove(dlp->dl_mh, dlp->dl_mth); - dlp->dl_mth = NULL; - } - - ASSERT(dlp->dl_npromisc == 0 || dlp->dl_mth != NULL); - mutex_exit(&dlp->dl_promisc_lock); - - rw_exit(&(dip->di_lock)); return (err); } int -dls_multicst_add(dls_channel_t dc, const uint8_t *addr) +dls_multicst_add(dld_str_t *dsp, const uint8_t *addr) { - dls_impl_t *dip = (dls_impl_t *)dc; int err; dls_multicst_addr_t **pp; dls_multicst_addr_t *p; uint_t addr_length; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + /* * Check whether the address is in the list of enabled addresses for - * this dls_impl_t. + * this dld_str_t. + */ + addr_length = dsp->ds_mip->mi_addr_length; + + /* + * Protect against concurrent access of ds_dmap by data threads using + * ds_rw_lock. The mac perimeter serializes the dls_multicst_add and + * remove operations. Dropping the ds_rw_lock across mac calls is thus + * ok and is also required by the locking protocol. */ - rw_enter(&(dip->di_lock), RW_WRITER); - addr_length = dip->di_mip->mi_addr_length; - for (pp = &(dip->di_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { + rw_enter(&dsp->ds_rw_lock, RW_WRITER); + for (pp = &(dsp->ds_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { if (bcmp(addr, p->dma_addr, addr_length) == 0) { /* * It is there so there's nothing to do. @@ -610,92 +312,92 @@ dls_multicst_add(dls_channel_t dc, const uint8_t *addr) } /* - * Allocate a new list item. + * Allocate a new list item and add it to the list. */ - if ((p = kmem_zalloc(sizeof (dls_multicst_addr_t), - KM_NOSLEEP)) == NULL) { - err = ENOMEM; - goto done; - } + p = kmem_zalloc(sizeof (dls_multicst_addr_t), KM_SLEEP); + bcopy(addr, p->dma_addr, addr_length); + *pp = p; + rw_exit(&dsp->ds_rw_lock); /* * Enable the address at the MAC. */ - if ((err = mac_multicst_add(dip->di_mh, addr)) != 0) { - kmem_free(p, sizeof (dls_multicst_addr_t)); - goto done; - } - - /* - * The address is now enabled at the MAC so add it to the list. - */ - bcopy(addr, p->dma_addr, addr_length); - *pp = p; + err = mac_multicast_add(dsp->ds_mch, addr); + if (err == 0) + return (0); + /* Undo the operation as it has failed */ + rw_enter(&dsp->ds_rw_lock, RW_WRITER); + ASSERT(*pp == p && p->dma_nextp == NULL); + *pp = NULL; + kmem_free(p, sizeof (dls_multicst_addr_t)); done: - rw_exit(&(dip->di_lock)); + rw_exit(&dsp->ds_rw_lock); return (err); } int -dls_multicst_remove(dls_channel_t dc, const uint8_t *addr) +dls_multicst_remove(dld_str_t *dsp, const uint8_t *addr) { - dls_impl_t *dip = (dls_impl_t *)dc; - int err; dls_multicst_addr_t **pp; dls_multicst_addr_t *p; uint_t addr_length; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + /* * Find the address in the list of enabled addresses for this - * dls_impl_t. + * dld_str_t. */ - rw_enter(&(dip->di_lock), RW_WRITER); - addr_length = dip->di_mip->mi_addr_length; - for (pp = &(dip->di_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { + addr_length = dsp->ds_mip->mi_addr_length; + + /* + * Protect against concurrent access to ds_dmap by data threads using + * ds_rw_lock. The mac perimeter serializes the dls_multicst_add and + * remove operations. Dropping the ds_rw_lock across mac calls is thus + * ok and is also required by the locking protocol. 
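+	 * The resulting order of operations: unlink the entry while holding
+	 * ds_rw_lock, drop the lock, then call into the mac layer, roughly:
+	 *
+	 *	rw_enter(&dsp->ds_rw_lock, RW_WRITER);
+	 *	*pp = p->dma_nextp;		(unlink)
+	 *	rw_exit(&dsp->ds_rw_lock);
+	 *	mac_multicast_remove(dsp->ds_mch, addr);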
+ */ + rw_enter(&dsp->ds_rw_lock, RW_WRITER); + for (pp = &(dsp->ds_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { if (bcmp(addr, p->dma_addr, addr_length) == 0) break; } /* * If we walked to the end of the list then the given address is - * not currently enabled for this dls_impl_t. + * not currently enabled for this dld_str_t. */ if (p == NULL) { - err = ENOENT; - goto done; + rw_exit(&dsp->ds_rw_lock); + return (ENOENT); } /* - * Disable the address at the MAC. + * Remove the address from the list. */ - if ((err = mac_multicst_remove(dip->di_mh, addr)) != 0) - goto done; + *pp = p->dma_nextp; + rw_exit(&dsp->ds_rw_lock); /* - * Remove the address from the list. + * Disable the address at the MAC. */ - *pp = p->dma_nextp; + mac_multicast_remove(dsp->ds_mch, addr); kmem_free(p, sizeof (dls_multicst_addr_t)); - -done: - rw_exit(&(dip->di_lock)); - return (err); + return (0); } mblk_t * -dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri, +dls_header(dld_str_t *dsp, const uint8_t *addr, uint16_t sap, uint_t pri, mblk_t **payloadp) { - dls_impl_t *dip = (dls_impl_t *)dc; uint16_t vid; size_t extra_len; uint16_t mac_sap; mblk_t *mp, *payload; - boolean_t is_ethernet = (dip->di_mip->mi_media == DL_ETHER); + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); struct ether_vlan_header *evhp; - vid = dip->di_dvp->dv_id; + vid = mac_client_vid(dsp->ds_mch); payload = (payloadp == NULL) ? NULL : (*payloadp); /* @@ -719,7 +421,7 @@ dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri, mac_sap = sap; } - mp = mac_header(dip->di_mh, addr, mac_sap, payload, extra_len); + mp = mac_header(dsp->ds_mh, addr, mac_sap, payload, extra_len); if (mp == NULL) return (NULL); @@ -772,209 +474,207 @@ dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri, return (mp); } -int -dls_header_info(dls_channel_t dc, mblk_t *mp, mac_header_info_t *mhip) -{ - return (dls_link_header_info(((dls_impl_t *)dc)->di_dvp->dv_dlp, - mp, mhip)); -} - void -dls_rx_set(dls_channel_t dc, dls_rx_t rx, void *arg) -{ - dls_impl_t *dip = (dls_impl_t *)dc; - - rw_enter(&(dip->di_lock), RW_WRITER); - dip->di_rx = rx; - dip->di_rx_arg = arg; - rw_exit(&(dip->di_lock)); -} - -mblk_t * -dls_tx(dls_channel_t dc, mblk_t *mp) +dls_rx_set(dld_str_t *dsp, dls_rx_t rx, void *arg) { - const mac_txinfo_t *mtp = ((dls_impl_t *)dc)->di_txinfo; - - return (mtp->mt_fn(mtp->mt_arg, mp)); + mutex_enter(&dsp->ds_lock); + dsp->ds_rx = rx; + dsp->ds_rx_arg = arg; + mutex_exit(&dsp->ds_lock); } -boolean_t -dls_accept(dls_impl_t *dip, mac_header_info_t *mhip, dls_rx_t *di_rx, - void **di_rx_arg) +static boolean_t +dls_accept_common(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, + void **ds_rx_arg, boolean_t promisc, boolean_t promisc_loopback) { dls_multicst_addr_t *dmap; - size_t addr_length = dip->di_mip->mi_addr_length; + size_t addr_length = dsp->ds_mip->mi_addr_length; /* - * We must not accept packets if the dls_impl_t is not marked as bound + * We must not accept packets if the dld_str_t is not marked as bound * or is being removed. */ - rw_enter(&(dip->di_lock), RW_READER); - if (!dip->di_bound || dip->di_removing) + if (dsp->ds_dlstate != DL_IDLE) goto refuse; - /* - * If the dls_impl_t is in 'all physical' mode then always accept. - */ - if (dip->di_promisc & DLS_PROMISC_PHYS) - goto accept; + if (dsp->ds_promisc != 0) { + /* + * Filter out packets that arrived from the data path + * (i_dls_link_rx) when promisc mode is on. 
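+		 * The checks below amount to the following, for a stream
+		 * with ds_promisc set:
+		 *
+		 *	!promisc          refuse (arrived via the data path)
+		 *	DLS_PROMISC_PHYS  accept everything
+		 *	loopback          refuse (only 'all physical' mode
+		 *	                  takes back its own transmits)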
+ */ + if (!promisc) + goto refuse; + /* + * If the dls_impl_t is in 'all physical' mode then + * always accept. + */ + if (dsp->ds_promisc & DLS_PROMISC_PHYS) + goto accept; - /* - * For non-promiscs-phys streams, filter out the packets looped back - * from the underlying driver because of promiscuous setting. - */ - if (mhip->mhi_prom_looped) - goto refuse; + /* + * Loopback packets i.e. packets sent out by DLS on a given + * mac end point, will be accepted back by DLS on loopback + * from the mac, only in the 'all physical' mode which has been + * covered by the previous check above + */ + if (promisc_loopback) + goto refuse; + } switch (mhip->mhi_dsttype) { case MAC_ADDRTYPE_UNICAST: + case MAC_ADDRTYPE_BROADCAST: /* - * Check to see if the destination address matches the - * dls_impl_t unicast address. + * We can accept unicast and broadcast packets because + * filtering is already done by the mac layer. */ - if (memcmp(mhip->mhi_daddr, dip->di_unicst_addr, addr_length) == - 0) { - goto accept; - } - break; + goto accept; case MAC_ADDRTYPE_MULTICAST: /* - * Check the address against the list of addresses enabled - * for this dls_impl_t or accept it unconditionally if the - * dls_impl_t is in 'all multicast' mode. + * Additional filtering is needed for multicast addresses + * because different streams may be interested in different + * addresses. */ - if (dip->di_promisc & DLS_PROMISC_MULTI) + if (dsp->ds_promisc & DLS_PROMISC_MULTI) goto accept; - for (dmap = dip->di_dmap; dmap != NULL; + + rw_enter(&dsp->ds_rw_lock, RW_READER); + for (dmap = dsp->ds_dmap; dmap != NULL; dmap = dmap->dma_nextp) { if (memcmp(mhip->mhi_daddr, dmap->dma_addr, addr_length) == 0) { + rw_exit(&dsp->ds_rw_lock); goto accept; } } + rw_exit(&dsp->ds_rw_lock); break; - case MAC_ADDRTYPE_BROADCAST: - /* - * If the address is broadcast then the dls_impl_t will - * always accept it. - */ - goto accept; } refuse: - rw_exit(&(dip->di_lock)); return (B_FALSE); accept: /* - * Since we hold di_lock here, the returned di_rx and di_rx_arg will - * always be in sync. + * the returned ds_rx and ds_rx_arg will always be in sync. */ - *di_rx = dip->di_rx; - *di_rx_arg = dip->di_rx_arg; - rw_exit(&(dip->di_lock)); + mutex_enter(&dsp->ds_lock); + *ds_rx = dsp->ds_rx; + *ds_rx_arg = dsp->ds_rx_arg; + mutex_exit(&dsp->ds_lock); + return (B_TRUE); } /* ARGSUSED */ boolean_t -dls_accept_loopback(dls_impl_t *dip, mac_header_info_t *mhip, dls_rx_t *di_rx, - void **di_rx_arg) +dls_accept(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, + void **ds_rx_arg) { - /* - * We must not accept packets if the dls_impl_t is not marked as bound - * or is being removed. - */ - rw_enter(&(dip->di_lock), RW_READER); - if (!dip->di_bound || dip->di_removing) - goto refuse; - - /* - * A dls_impl_t should only accept loopback packets if it is in - * 'all physical' mode. - */ - if (dip->di_promisc & DLS_PROMISC_PHYS) - goto accept; - -refuse: - rw_exit(&(dip->di_lock)); - return (B_FALSE); - -accept: - /* - * Since we hold di_lock here, the returned di_rx and di_rx_arg will - * always be in sync. 
- */ - *di_rx = dip->di_rx; - *di_rx_arg = dip->di_rx_arg; - rw_exit(&(dip->di_lock)); - return (B_TRUE); + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_FALSE, + B_FALSE)); } boolean_t +dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, + void **ds_rx_arg, boolean_t loopback) +{ + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, + loopback)); +} + +int dls_mac_active_set(dls_link_t *dlp) { - mutex_enter(&dlp->dl_lock); + int err = 0; /* - * If this is the first active client on this link, notify - * the mac that we're becoming an active client. + * First client; add the primary unicast address. */ - if (dlp->dl_nactive == 0 && !mac_active_shareable_set(dlp->dl_mh)) { - mutex_exit(&dlp->dl_lock); - return (B_FALSE); + if (dlp->dl_nactive == 0) { + /* + * First client; add the primary unicast address. + */ + mac_diag_t diag; + + /* request the primary MAC address */ + if ((err = mac_unicast_primary_add(dlp->dl_mch, &dlp->dl_mah, + &diag)) != 0) { + return (err); + } + + /* + * Set the function to start receiving packets. + */ + mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + + /* + * We've got a MAC client for this link now. + * Push down the flows that were defined on this link + * hitherto. The flows are added to the active flow table + * and SRS, softrings etc. are created as needed. + */ + mac_link_init_flows(dlp->dl_mch); } dlp->dl_nactive++; - mutex_exit(&dlp->dl_lock); - return (B_TRUE); + return (0); } void dls_mac_active_clear(dls_link_t *dlp) { - mutex_enter(&dlp->dl_lock); - if (--dlp->dl_nactive == 0) - mac_active_clear(dlp->dl_mh); - mutex_exit(&dlp->dl_lock); + if (--dlp->dl_nactive == 0) { + ASSERT(dlp->dl_mah != NULL); + /* + * We would have initialized subflows etc. only if we + * brought up the primary client and set the unicast + * unicast address etc. Deactivate the flows. The flow + * entry will be removed from the active flow tables, + * and the associated SRS, softrings etc will be + * deleted. But the flow entry itself won't be + * destroyed, instead it will continue to be + * archived off the the global flow hash list, for a + * possible future activation when say + * IP is plumbed again + */ + + mac_link_release_flows(dlp->dl_mch); + (void) mac_unicast_remove(dlp->dl_mch, dlp->dl_mah); + dlp->dl_mah = NULL; + mac_rx_clear(dlp->dl_mch); + } } -boolean_t -dls_active_set(dls_channel_t dc) +int +dls_active_set(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp = dip->di_dvp->dv_dlp; + int err = 0; - rw_enter(&dip->di_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); /* If we're already active, then there's nothing more to do. 
*/ - if (dip->di_active) { - rw_exit(&dip->di_lock); - return (B_TRUE); - } + if (dsp->ds_active) + return (0); - if (!dls_mac_active_set(dlp)) { - rw_exit(&dip->di_lock); - return (B_FALSE); + if ((err = dls_mac_active_set(dsp->ds_dlp)) != 0) { + /* except for ENXIO all other errors are mapped to EBUSY */ + if (err != ENXIO) + return (EBUSY); + return (err); } - dip->di_active = B_TRUE; - rw_exit(&dip->di_lock); - return (B_TRUE); + + dsp->ds_active = B_TRUE; + return (0); } void -dls_active_clear(dls_channel_t dc) +dls_active_clear(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp = dip->di_dvp->dv_dlp; - - rw_enter(&dip->di_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - if (!dip->di_active) - goto out; - dip->di_active = B_FALSE; - - dls_mac_active_clear(dlp); + if (!dsp->ds_active) + return; -out: - rw_exit(&dip->di_lock); + dls_mac_active_clear(dsp->ds_dlp); + dsp->ds_active = B_FALSE; } diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 759fb97f0a..852b87d24b 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -23,34 +23,21 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> -#include <sys/stream.h> -#include <sys/strsun.h> -#include <sys/strsubr.h> #include <sys/sysmacros.h> -#include <sys/atomic.h> -#include <sys/modhash.h> -#include <sys/dlpi.h> -#include <sys/ethernet.h> -#include <sys/byteorder.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/vlan.h> -#include <sys/mac.h> -#include <sys/sdt.h> - -#include <sys/dls.h> #include <sys/dld_impl.h> -#include <sys/dls_impl.h> +#include <sys/sdt.h> +#include <sys/atomic.h> static kmem_cache_t *i_dls_link_cachep; static mod_hash_t *i_dls_link_hash; static uint_t i_dls_link_count; -static krwlock_t i_dls_link_lock; #define LINK_HASHSZ 67 /* prime */ #define IMPL_HASHSZ 67 /* prime */ @@ -58,15 +45,8 @@ static krwlock_t i_dls_link_lock; /* * Construct a hash key encompassing both DLSAP value and VLAN idenitifier. */ -#define MAKE_KEY(_sap, _vid) \ - ((mod_hash_key_t)(uintptr_t) \ - (((_sap) << VLAN_ID_SIZE) | (_vid) & VLAN_ID_MASK)) - -/* - * Extract the DLSAP value from the hash key. 
- */ -#define KEY_SAP(_key) \ - (((uint32_t)(uintptr_t)(_key)) >> VLAN_ID_SIZE) +#define MAKE_KEY(_sap) \ + ((mod_hash_key_t)(uintptr_t)((_sap) << VLAN_ID_SIZE)) #define DLS_STRIP_PADDING(pktsize, p) { \ if (pktsize != 0) { \ @@ -91,12 +71,9 @@ i_dls_link_constructor(void *buf, void *arg, int kmflag) bzero(buf, sizeof (dls_link_t)); (void) snprintf(name, MAXNAMELEN, "dls_link_t_%p_hash", buf); - dlp->dl_impl_hash = mod_hash_create_idhash(name, IMPL_HASHSZ, + dlp->dl_str_hash = mod_hash_create_idhash(name, IMPL_HASHSZ, mod_hash_null_valdtor); - mutex_init(&dlp->dl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dlp->dl_promisc_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&dlp->dl_impl_lock, NULL, RW_DEFAULT, NULL); return (0); } @@ -108,14 +85,12 @@ i_dls_link_destructor(void *buf, void *arg) ASSERT(dlp->dl_ref == 0); ASSERT(dlp->dl_mh == NULL); + ASSERT(dlp->dl_mah == NULL); ASSERT(dlp->dl_unknowns == 0); - mod_hash_destroy_idhash(dlp->dl_impl_hash); - dlp->dl_impl_hash = NULL; + mod_hash_destroy_idhash(dlp->dl_str_hash); + dlp->dl_str_hash = NULL; - mutex_destroy(&dlp->dl_lock); - mutex_destroy(&dlp->dl_promisc_lock); - rw_destroy(&dlp->dl_impl_lock); } /* @@ -195,8 +170,7 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, */ if (memcmp(mhip->mhi_daddr, cmhi.mhi_daddr, addr_size) != 0 || memcmp(mhip->mhi_saddr, cmhi.mhi_saddr, addr_size) != 0 || - mhip->mhi_bindsap != cmhi.mhi_bindsap || - mhip->mhi_prom_looped != cmhi.mhi_prom_looped) { + mhip->mhi_bindsap != cmhi.mhi_bindsap) { /* * Note that we don't need to restore the padding. */ @@ -239,16 +213,34 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, return (mp); } -static void -i_dls_head_hold(dls_head_t *dhp) +/* ARGSUSED */ +static int +i_dls_head_hold(mod_hash_key_t key, mod_hash_val_t val) { - atomic_inc_32(&dhp->dh_ref); + dls_head_t *dhp = (dls_head_t *)val; + + /* + * The lock order is mod_hash's internal lock -> dh_lock as in the + * call to i_dls_link_rx -> mod_hash_find_cb_rval -> i_dls_head_hold + */ + mutex_enter(&dhp->dh_lock); + if (dhp->dh_removing) { + mutex_exit(&dhp->dh_lock); + return (-1); + } + dhp->dh_ref++; + mutex_exit(&dhp->dh_lock); + return (0); } -static void +void i_dls_head_rele(dls_head_t *dhp) { - atomic_dec_32(&dhp->dh_ref); + mutex_enter(&dhp->dh_lock); + dhp->dh_ref--; + if (dhp->dh_ref == 0 && dhp->dh_removing != 0) + cv_broadcast(&dhp->dh_cv); + mutex_exit(&dhp->dh_lock); } static dls_head_t * @@ -276,83 +268,86 @@ i_dls_head_free(dls_head_t *dhp) */ static uint_t i_dls_link_rx_func(dls_link_t *dlp, mac_resource_handle_t mrh, - mac_header_info_t *mhip, mblk_t *mp, uint32_t sap, uint16_t vid, + mac_header_info_t *mhip, mblk_t *mp, uint32_t sap, boolean_t (*acceptfunc)()) { - mod_hash_t *hash = dlp->dl_impl_hash; + mod_hash_t *hash = dlp->dl_str_hash; mod_hash_key_t key; dls_head_t *dhp; - dls_impl_t *dip; + dld_str_t *dsp; mblk_t *nmp; - dls_rx_t di_rx; - void *di_rx_arg; + dls_rx_t ds_rx; + void *ds_rx_arg; uint_t naccepted = 0; + int rval; /* * Construct a hash key from the VLAN identifier and the - * DLSAP that represents dls_impl_t in promiscuous mode. + * DLSAP that represents dld_str_t in promiscuous mode. */ - key = MAKE_KEY(sap, vid); + key = MAKE_KEY(sap); /* - * Search the hash table for dls_impl_t eligible to receive - * a packet chain for this DLSAP/VLAN combination. + * Search the hash table for dld_str_t eligible to receive + * a packet chain for this DLSAP/VLAN combination. 
The mod hash's + * internal lock serializes find/insert/remove from the mod hash list. + * Incrementing the dh_ref (while holding the mod hash lock) ensures + * dls_link_remove will wait for the upcall to finish. */ - rw_enter(&dlp->dl_impl_lock, RW_READER); - if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) { - rw_exit(&dlp->dl_impl_lock); + if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp, + i_dls_head_hold, &rval) != 0 || (rval != 0)) { return (B_FALSE); } - i_dls_head_hold(dhp); - rw_exit(&dlp->dl_impl_lock); /* - * Find dls_impl_t that will accept the sub-chain. + * Find dld_str_t that will accept the sub-chain. */ - for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) { - if (!acceptfunc(dip, mhip, &di_rx, &di_rx_arg)) + for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) { + if (!acceptfunc(dsp, mhip, &ds_rx, &ds_rx_arg)) continue; /* * We have at least one acceptor. */ - naccepted ++; + naccepted++; /* - * There will normally be at least more dls_impl_t + * There will normally be at least more dld_str_t * (since we've yet to check for non-promiscuous - * dls_impl_t) so dup the sub-chain. + * dld_str_t) so dup the sub-chain. */ if ((nmp = copymsgchain(mp)) != NULL) - di_rx(di_rx_arg, mrh, nmp, mhip); + ds_rx(ds_rx_arg, mrh, nmp, mhip); } /* - * Release the hold on the dls_impl_t chain now that we have + * Release the hold on the dld_str_t chain now that we have * finished walking it. */ i_dls_head_rele(dhp); return (naccepted); } -static void -i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +/* ARGSUSED */ +void +i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { dls_link_t *dlp = arg; - mod_hash_t *hash = dlp->dl_impl_hash; + mod_hash_t *hash = dlp->dl_str_hash; mblk_t *nextp; mac_header_info_t mhi; dls_head_t *dhp; - dls_impl_t *dip; - dls_impl_t *ndip; + dld_str_t *dsp; + dld_str_t *ndsp; mblk_t *nmp; mod_hash_key_t key; uint_t npacket; boolean_t accepted; - dls_rx_t di_rx, ndi_rx; - void *di_rx_arg, *ndi_rx_arg; + dls_rx_t ds_rx, nds_rx; + void *ds_rx_arg, *nds_rx_arg; uint16_t vid; - int err; + int err, rval; /* * Walk the packet chain. @@ -384,11 +379,11 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) if (mhi.mhi_istagged) { /* * If it is tagged traffic, send it upstream to - * all dls_impl_t which are attached to the physical + * all dld_str_t which are attached to the physical * link and bound to SAP 0x8100. */ if (i_dls_link_rx_func(dlp, mrh, &mhi, mp, - ETHERTYPE_VLAN, VLAN_ID_NONE, dls_accept) > 0) { + ETHERTYPE_VLAN, dls_accept) > 0) { accepted = B_TRUE; } @@ -413,33 +408,30 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) * Construct a hash key from the VLAN identifier and the * DLSAP. */ - key = MAKE_KEY(mhi.mhi_bindsap, vid); + key = MAKE_KEY(mhi.mhi_bindsap); /* - * Search the has table for dls_impl_t eligible to receive + * Search the has table for dld_str_t eligible to receive * a packet chain for this DLSAP/VLAN combination. */ - rw_enter(&dlp->dl_impl_lock, RW_READER); - if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) { - rw_exit(&dlp->dl_impl_lock); + if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp, + i_dls_head_hold, &rval) != 0 || (rval != 0)) { freemsgchain(mp); goto loop; } - i_dls_head_hold(dhp); - rw_exit(&dlp->dl_impl_lock); /* - * Find the first dls_impl_t that will accept the sub-chain. + * Find the first dld_str_t that will accept the sub-chain. 
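+	 * The dispatch below hands the chain to an acceptor without copying
+	 * only when no further acceptor exists, so the common
+	 * single-listener case avoids copymsgchain() entirely. In outline:
+	 *
+	 *	for each accepting dld_str_t:
+	 *		if (no further acceptor)
+	 *			ds_rx(mp);		(hand off, no copy)
+	 *		else
+	 *			ds_rx(copymsgchain(mp));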
*/ - for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) - if (dls_accept(dip, &mhi, &di_rx, &di_rx_arg)) + for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) + if (dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg)) break; /* - * If we did not find any dls_impl_t willing to accept the + * If we did not find any dld_str_t willing to accept the * sub-chain then throw it away. */ - if (dip == NULL) { + if (dsp == NULL) { i_dls_head_rele(dhp); freemsgchain(mp); goto loop; @@ -451,43 +443,43 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) accepted = B_TRUE; for (;;) { /* - * Find the next dls_impl_t that will accept the + * Find the next dld_str_t that will accept the * sub-chain. */ - for (ndip = dip->di_nextp; ndip != NULL; - ndip = ndip->di_nextp) - if (dls_accept(ndip, &mhi, &ndi_rx, - &ndi_rx_arg)) + for (ndsp = dsp->ds_next; ndsp != NULL; + ndsp = ndsp->ds_next) + if (dls_accept(ndsp, &mhi, &nds_rx, + &nds_rx_arg)) break; /* - * If there are no more dls_impl_t that are willing + * If there are no more dld_str_t that are willing * to accept the sub-chain then we don't need to dup * it before handing it to the current one. */ - if (ndip == NULL) { - di_rx(di_rx_arg, mrh, mp, &mhi); + if (ndsp == NULL) { + ds_rx(ds_rx_arg, mrh, mp, &mhi); /* - * Since there are no more dls_impl_t, we're + * Since there are no more dld_str_t, we're * done. */ break; } /* - * There are more dls_impl_t so dup the sub-chain. + * There are more dld_str_t so dup the sub-chain. */ if ((nmp = copymsgchain(mp)) != NULL) - di_rx(di_rx_arg, mrh, nmp, &mhi); + ds_rx(ds_rx_arg, mrh, nmp, &mhi); - dip = ndip; - di_rx = ndi_rx; - di_rx_arg = ndi_rx_arg; + dsp = ndsp; + ds_rx = nds_rx; + ds_rx_arg = nds_rx_arg; } /* - * Release the hold on the dls_impl_t chain now that we have + * Release the hold on the dld_str_t chain now that we have * finished walking it. */ i_dls_head_rele(dhp); @@ -502,220 +494,119 @@ loop: } } -/* - * Try to send mp up to the DLS_SAP_PROMISC listeners. Return B_TRUE if this - * message is sent to any streams. - */ -static uint_t -i_dls_link_rx_common_promisc(dls_link_t *dlp, mac_resource_handle_t mrh, - mac_header_info_t *mhip, mblk_t *mp, uint16_t vid, - boolean_t (*acceptfunc)()) +/* ARGSUSED */ +void +dls_rx_vlan_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { - uint_t naccepted; + dld_str_t *dsp = arg; + dls_link_t *dlp = dsp->ds_dlp; + mac_header_info_t mhi; + dls_rx_t ds_rx; + void *ds_rx_arg; + int err; - naccepted = i_dls_link_rx_func(dlp, mrh, mhip, mp, DLS_SAP_PROMISC, - vid, acceptfunc); + DLS_PREPARE_PKT(dlp, mp, &mhi, err); + if (err != 0) + goto drop; - if (vid != VLAN_ID_NONE) { - naccepted += i_dls_link_rx_func(dlp, mrh, mhip, mp, - DLS_SAP_PROMISC, VLAN_ID_NONE, acceptfunc); + /* + * If there is promiscuous handle for vlan, we filter out the untagged + * pkts and pkts that are not for the primary unicast address. 
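+	 * That is, only tagged packets whose destination matches the
+	 * primary unicast address are passed up; in outline:
+	 *
+	 *	if (!mhi.mhi_istagged || memcmp(mhi.mhi_daddr, prim_addr,
+	 *	    addr_length) != 0)
+	 *		drop;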
+ */ + if (dsp->ds_vlan_mph != NULL) { + uint8_t prim_addr[MAXMACADDRLEN]; + size_t addr_length = dsp->ds_mip->mi_addr_length; + + if (!(mhi.mhi_istagged)) + goto drop; + ASSERT(dsp->ds_mh != NULL); + mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)prim_addr); + if (memcmp(mhi.mhi_daddr, prim_addr, addr_length) != 0) + goto drop; + + if (!dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg)) + goto drop; + + ds_rx(ds_rx_arg, NULL, mp, &mhi); + return; } - return (naccepted); + +drop: + atomic_add_32(&dlp->dl_unknowns, 1); + freemsg(mp); } -static void -i_dls_link_rx_common(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t (*acceptfunc)()) +/* ARGSUSED */ +void +dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { - dls_link_t *dlp = arg; - mod_hash_t *hash = dlp->dl_impl_hash; - mblk_t *nextp; + dld_str_t *dsp = arg; + dls_link_t *dlp = dsp->ds_dlp; mac_header_info_t mhi; - uint16_t vid, vidkey, pri; + dls_rx_t ds_rx; + void *ds_rx_arg; + int err; dls_head_t *dhp; - dls_impl_t *dip; - mblk_t *nmp; mod_hash_key_t key; - uint_t npacket; - uint32_t sap; - boolean_t accepted; - dls_rx_t di_rx, fdi_rx; - void *di_rx_arg, *fdi_rx_arg; - boolean_t pass2; - int err; + + DLS_PREPARE_PKT(dlp, mp, &mhi, err); + if (err != 0) + goto drop; /* - * Walk the packet chain. + * In order to filter out sap pkt that no dls channel listens, search + * the hash table trying to find a dld_str_t eligible to receive the pkt */ - for (; mp != NULL; mp = nextp) { - /* - * Wipe the accepted state and the receive information of - * the first eligible dls_impl_t. - */ - accepted = B_FALSE; - pass2 = B_FALSE; - fdi_rx = NULL; - fdi_rx_arg = NULL; - - DLS_PREPARE_PKT(dlp, mp, &mhi, err); - if (err != 0) { - if (acceptfunc == dls_accept) - atomic_add_32(&(dlp->dl_unknowns), 1); - nextp = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - continue; - } - - /* - * Grab the longest sub-chain we can process as a single - * unit. - */ - nextp = i_dls_link_subchain(dlp, mp, &mhi, &npacket); - ASSERT(npacket != 0); - - vid = VLAN_ID(mhi.mhi_tci); - pri = VLAN_PRI(mhi.mhi_tci); - - vidkey = vid; - - /* - * Note that we need to first send to the dls_impl_t - * in promiscuous mode in order to avoid the packet reordering - * when snooping. - */ - if (i_dls_link_rx_common_promisc(dlp, mrh, &mhi, mp, vidkey, - acceptfunc) > 0) { - accepted = B_TRUE; - } - - /* - * Non promisc case. Two passes: - * 1. send tagged packets to ETHERTYPE_VLAN listeners - * 2. send packets to listeners bound to the specific SAP. - */ - if (mhi.mhi_istagged) { - vidkey = VLAN_ID_NONE; - sap = ETHERTYPE_VLAN; - } else { - goto non_promisc_loop; - } -non_promisc: - /* - * Construct a hash key from the VLAN identifier and the - * DLSAP. - */ - key = MAKE_KEY(sap, vidkey); - - /* - * Search the has table for dls_impl_t eligible to receive - * a packet chain for this DLSAP/VLAN combination. - */ - rw_enter(&dlp->dl_impl_lock, RW_READER); - if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) { - rw_exit(&dlp->dl_impl_lock); - goto non_promisc_loop; - } - i_dls_head_hold(dhp); - rw_exit(&dlp->dl_impl_lock); - - /* - * Find the first dls_impl_t that will accept the sub-chain. - */ - for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) { - if (!acceptfunc(dip, &mhi, &di_rx, &di_rx_arg)) - continue; - - accepted = B_TRUE; - - /* - * To avoid the extra copymsgchain(), if this - * is the first eligible dls_impl_t, remember required - * information and send up the message afterwards. 
- */ - if (fdi_rx == NULL) { - fdi_rx = di_rx; - fdi_rx_arg = di_rx_arg; - continue; - } + if ((dsp->ds_promisc & DLS_PROMISC_SAP) == 0) { + key = MAKE_KEY(mhi.mhi_bindsap); + if (mod_hash_find(dsp->ds_dlp->dl_str_hash, key, + (mod_hash_val_t *)&dhp) != 0) + goto drop; + } - if ((nmp = copymsgchain(mp)) != NULL) - di_rx(di_rx_arg, mrh, nmp, &mhi); - } + if (!dls_accept_promisc(dsp, &mhi, &ds_rx, &ds_rx_arg, loopback)) + goto drop; - /* - * Release the hold on the dls_impl_t chain now that we have - * finished walking it. - */ - i_dls_head_rele(dhp); + ds_rx(ds_rx_arg, NULL, mp, &mhi); + return; -non_promisc_loop: - /* - * Don't pass the packets up again if: - * - First pass is done and the packets are tagged and their: - * - VID and priority are both zero (invalid packets). - * - their sap is ETHERTYPE_VLAN and their VID is zero - * (they have already been sent upstreams). - * - Second pass is done: - */ - if (pass2 || (mhi.mhi_istagged && - ((vid == VLAN_ID_NONE && pri == 0) || - (mhi.mhi_bindsap == ETHERTYPE_VLAN && - vid == VLAN_ID_NONE)))) { - /* - * Send the message up to the first eligible dls_impl_t. - */ - if (fdi_rx != NULL) - fdi_rx(fdi_rx_arg, mrh, mp, &mhi); - else - freemsgchain(mp); - } else { - vidkey = vid; - sap = mhi.mhi_bindsap; - pass2 = B_TRUE; - goto non_promisc; - } - - /* - * If there were no acceptors then add the packet count to the - * 'unknown' count. - */ - if (!accepted && (acceptfunc == dls_accept)) - atomic_add_32(&(dlp->dl_unknowns), npacket); - } +drop: + atomic_add_32(&dlp->dl_unknowns, 1); + freemsg(mp); } static void -i_dls_link_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp) -{ - i_dls_link_rx_common(arg, mrh, mp, dls_accept); -} - -void -dls_link_txloop(void *arg, mblk_t *mp) +i_dls_link_destroy(dls_link_t *dlp) { - i_dls_link_rx_common(arg, NULL, mp, dls_accept_loopback); -} + ASSERT(dlp->dl_nactive == 0); + ASSERT(dlp->dl_impl_count == 0); + ASSERT(dlp->dl_zone_ref == 0); -/*ARGSUSED*/ -static uint_t -i_dls_link_walk(mod_hash_key_t key, mod_hash_val_t *val, void *arg) -{ - boolean_t *promiscp = arg; - uint32_t sap = KEY_SAP(key); + /* + * Free the structure back to the cache. + */ + if (dlp->dl_mch != NULL) + mac_client_close(dlp->dl_mch, 0); - if (sap == DLS_SAP_PROMISC) { - *promiscp = B_TRUE; - return (MH_WALK_TERMINATE); + if (dlp->dl_mh != NULL) { + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); + mac_close(dlp->dl_mh); } - return (MH_WALK_CONTINUE); + dlp->dl_mh = NULL; + dlp->dl_mch = NULL; + dlp->dl_mip = NULL; + dlp->dl_unknowns = 0; + kmem_cache_free(i_dls_link_cachep, dlp); } static int i_dls_link_create(const char *name, dls_link_t **dlpp) { dls_link_t *dlp; + int err; /* * Allocate a new dls_link_t structure. @@ -728,32 +619,34 @@ i_dls_link_create(const char *name, dls_link_t **dlpp) (void) strlcpy(dlp->dl_name, name, sizeof (dlp->dl_name)); /* - * Initialize promiscuous bookkeeping fields. + * First reference; hold open the MAC interface. 
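+	 * Setup then proceeds mac_open() -> mac_client_open() with the
+	 * DLS-specific open flags; on any failure we bail to
+	 * i_dls_link_destroy(), which copes with partially initialized
+	 * links (NULL dl_mh/dl_mch).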
*/ - dlp->dl_npromisc = 0; - dlp->dl_mth = NULL; + ASSERT(dlp->dl_mh == NULL); + err = mac_open(dlp->dl_name, &dlp->dl_mh); + if (err != 0) + goto bail; + + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); + dlp->dl_mip = mac_info(dlp->dl_mh); + + /* DLS is the "primary" MAC client */ + ASSERT(dlp->dl_mch == NULL); + + err = mac_client_open(dlp->dl_mh, &dlp->dl_mch, NULL, + MAC_OPEN_FLAGS_TAG_DISABLE | MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK | + MAC_OPEN_FLAGS_USE_DATALINK_NAME); + if (err != 0) + goto bail; + + DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *, + dlp->dl_mch); *dlpp = dlp; return (0); -} -static void -i_dls_link_destroy(dls_link_t *dlp) -{ - ASSERT(dlp->dl_npromisc == 0); - ASSERT(dlp->dl_nactive == 0); - ASSERT(dlp->dl_mth == NULL); - ASSERT(dlp->dl_macref == 0); - ASSERT(dlp->dl_mh == NULL); - ASSERT(dlp->dl_mip == NULL); - ASSERT(dlp->dl_impl_count == 0); - ASSERT(dlp->dl_mrh == NULL); - - /* - * Free the structure back to the cache. - */ - dlp->dl_unknowns = 0; - kmem_cache_free(i_dls_link_cachep, dlp); +bail: + i_dls_link_destroy(dlp); + return (err); } /* @@ -777,7 +670,6 @@ dls_link_init(void) i_dls_link_hash = mod_hash_create_extended("dls_link_hash", IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); - rw_init(&i_dls_link_lock, NULL, RW_DEFAULT, NULL); i_dls_link_count = 0; } @@ -796,7 +688,6 @@ dls_link_fini(void) * Destroy the hash table and associated lock. */ mod_hash_destroy_hash(i_dls_link_hash); - rw_destroy(&i_dls_link_lock); return (0); } @@ -804,32 +695,33 @@ dls_link_fini(void) * Exported functions. */ -int -dls_link_hold(const char *name, dls_link_t **dlpp) +static int +dls_link_hold_common(const char *name, dls_link_t **dlpp, boolean_t create) { dls_link_t *dlp; int err; /* - * Look up a dls_link_t corresponding to the given mac_handle_t - * in the global hash table. We need to hold i_dls_link_lock in - * order to atomically find and insert a dls_link_t into the - * hash table. + * Look up a dls_link_t corresponding to the given macname in the + * global hash table. The i_dls_link_hash itself is protected by the + * mod_hash package's internal lock which synchronizes + * find/insert/remove into the global mod_hash list. Assumes that + * inserts and removes are single threaded on a per mac end point + * by the mac perimeter. */ - rw_enter(&i_dls_link_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name, (mod_hash_val_t *)&dlp)) == 0) goto done; ASSERT(err == MH_ERR_NOTFOUND); + if (!create) + return (ENOENT); /* * We didn't find anything so we need to create one. */ - if ((err = i_dls_link_create(name, &dlp)) != 0) { - rw_exit(&i_dls_link_lock); + if ((err = i_dls_link_create(name, &dlp)) != 0) return (err); - } /* * Insert the dls_link_t. @@ -838,124 +730,200 @@ dls_link_hold(const char *name, dls_link_t **dlpp) (mod_hash_val_t)dlp); ASSERT(err == 0); - i_dls_link_count++; + atomic_add_32(&i_dls_link_count, 1); ASSERT(i_dls_link_count != 0); done: - + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* * Bump the reference count and hand back the reference. 
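 	 * dl_ref is protected by the mac perimeter rather than by a lock;
 	 * callers hold and release under that perimeter, as in this
 	 * illustrative sequence (cf. dls_link_devinfo() below):
 	 *
 	 *	mac_perim_enter_by_macname(macname, &mph);
 	 *	(void) dls_link_hold(macname, &dlp);
 	 *	... use dlp ...
 	 *	dls_link_rele(dlp);
 	 *	mac_perim_exit(mph);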
*/ dlp->dl_ref++; *dlpp = dlp; - rw_exit(&i_dls_link_lock); return (0); } +int +dls_link_hold_create(const char *name, dls_link_t **dlpp) +{ + return (dls_link_hold_common(name, dlpp, B_TRUE)); +} + +int +dls_link_hold(const char *name, dls_link_t **dlpp) +{ + return (dls_link_hold_common(name, dlpp, B_FALSE)); +} + +dev_info_t * +dls_link_devinfo(dev_t dev) +{ + dls_link_t *dlp; + dev_info_t *dip; + char macname[MAXNAMELEN]; + char *drv; + mac_perim_handle_t mph; + + if ((drv = ddi_major_to_name(getmajor(dev))) == NULL) + return (NULL); + (void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1); + + /* + * The code below assumes that the name constructed above is the + * macname. This is not the case for legacy devices. Currently this + * is ok because this function is only called in the getinfo(9e) path, + * which for a legacy device would directly end up in the driver's + * getinfo, rather than here + */ + if (mac_perim_enter_by_macname(macname, &mph) != 0) + return (NULL); + + if (dls_link_hold(macname, &dlp) != 0) { + mac_perim_exit(mph); + return (NULL); + } + + dip = mac_devinfo_get(dlp->dl_mh); + dls_link_rele(dlp); + mac_perim_exit(mph); + + return (dip); +} + +dev_t +dls_link_dev(dls_link_t *dlp) +{ + return (makedevice(ddi_driver_major(mac_devinfo_get(dlp->dl_mh)), + mac_minor(dlp->dl_mh))); +} + void dls_link_rele(dls_link_t *dlp) { mod_hash_val_t val; - rw_enter(&i_dls_link_lock, RW_WRITER); - + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* * Check if there are any more references. */ - if (--dlp->dl_ref != 0) { + if (--dlp->dl_ref == 0) { + (void) mod_hash_remove(i_dls_link_hash, + (mod_hash_key_t)dlp->dl_name, &val); + ASSERT(dlp == (dls_link_t *)val); + /* - * There are more references so there's nothing more to do. + * Destroy the dls_link_t. */ - goto done; + i_dls_link_destroy(dlp); + ASSERT(i_dls_link_count > 0); + atomic_add_32(&i_dls_link_count, -1); } +} + +int +dls_link_rele_by_name(const char *name) +{ + dls_link_t *dlp; + + if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name, + (mod_hash_val_t *)&dlp) != 0) + return (ENOENT); - (void) mod_hash_remove(i_dls_link_hash, - (mod_hash_key_t)dlp->dl_name, &val); - ASSERT(dlp == (dls_link_t *)val); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* - * Destroy the dls_link_t. + * Must fail detach if mac client is busy. */ - i_dls_link_destroy(dlp); - ASSERT(i_dls_link_count > 0); - i_dls_link_count--; -done: - rw_exit(&i_dls_link_lock); + ASSERT(dlp->dl_ref > 0 && dlp->dl_mch != NULL); + if (mac_link_has_flows(dlp->dl_mch)) + return (ENOTEMPTY); + + dls_link_rele(dlp); + return (0); } int -dls_mac_hold(dls_link_t *dlp) +dls_link_setzid(const char *name, zoneid_t zid) { - mac_handle_t mh; - int err = 0; + dls_link_t *dlp; + int err = 0; + zoneid_t old_zid; + + if ((err = dls_link_hold_create(name, &dlp)) != 0) + return (err); - err = mac_open(dlp->dl_name, &mh); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); - mutex_enter(&dlp->dl_lock); + if ((old_zid = dlp->dl_zid) == zid) + goto done; - ASSERT(IMPLY(dlp->dl_macref != 0, dlp->dl_mh != NULL)); - ASSERT(IMPLY(dlp->dl_macref == 0, dlp->dl_mh == NULL)); - if (err == 0) { - ASSERT(dlp->dl_mh == NULL || dlp->dl_mh == mh); - if (dlp->dl_mh == NULL) { - dlp->dl_mh = mh; - dlp->dl_mip = mac_info(mh); + /* + * Check whether this dlp is used by its own zones, if yes, + * we cannot change its zoneid. 
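+	 * Otherwise the move is one of three cases:
+	 *
+	 *	local  -> global   clear the link's active state and drop
+	 *	                   the hold taken above
+	 *	global -> local    mark the link active (blocking aggr
+	 *	                   creation over it) and keep the hold
+	 *	local  -> local    simply update dl_zid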
+ */ + if (dlp->dl_zone_ref != 0) { + err = EBUSY; + goto done; + } + + if (zid == GLOBAL_ZONEID) { + /* + * Move the link from the local zone to the global zone, + * and release the reference to this link. At the same time + * reset the link's active state so that an aggregation is + * allowed to be created over it. + */ + dlp->dl_zid = zid; + dls_mac_active_clear(dlp); + dls_link_rele(dlp); + goto done; + } else if (old_zid == GLOBAL_ZONEID) { + /* + * Move the link from the global zone to the local zone, + * and hold a reference to this link. Also, set the link + * to the "active" state so that the global zone is + * not able to create an aggregation over this link. + * TODO: revisit once we allow creating aggregations + * within a local zone. + */ + if ((err = dls_mac_active_set(dlp)) != 0) { + if (err != ENXIO) + err = EBUSY; + goto done; } - dlp->dl_macref++; + dlp->dl_zid = zid; + return (0); + } else { + /* + * Move the link from a local zone to another local zone. + */ + dlp->dl_zid = zid; } - mutex_exit(&dlp->dl_lock); +done: + dls_link_rele(dlp); return (err); } void -dls_mac_rele(dls_link_t *dlp) -{ - mutex_enter(&dlp->dl_lock); - ASSERT(dlp->dl_mh != NULL); - - mac_close(dlp->dl_mh); - - if (--dlp->dl_macref == 0) { - dlp->dl_mh = NULL; - dlp->dl_mip = NULL; - } - mutex_exit(&dlp->dl_lock); -} - -void -dls_link_add(dls_link_t *dlp, uint32_t sap, dls_impl_t *dip) +dls_link_add(dls_link_t *dlp, uint32_t sap, dld_str_t *dsp) { - dls_vlan_t *dvp = dip->di_dvp; - mod_hash_t *hash = dlp->dl_impl_hash; + mod_hash_t *hash = dlp->dl_str_hash; mod_hash_key_t key; dls_head_t *dhp; - dls_impl_t *p; - mac_rx_t rx; + dld_str_t *p; int err; - boolean_t promisc = B_FALSE; - /* - * Generate a hash key based on the sap and the VLAN id. - */ - key = MAKE_KEY(sap, dvp->dv_id); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* - * We need dl_lock here because we want to be able to walk - * the hash table *and* set the mac rx func atomically. if - * these two operations are separate, someone else could - * insert/remove dls_impl_t from the hash table after we - * drop the hash lock and this could cause our chosen rx - * func to be incorrect. note that we cannot call mac_rx_add - * when holding the hash lock because this can cause deadlock. + * Generate a hash key based on the sap. */ - mutex_enter(&dlp->dl_lock); + key = MAKE_KEY(sap); /* * Search the table for a list head with this key. */ - rw_enter(&dlp->dl_impl_lock, RW_WRITER); - if ((err = mod_hash_find(hash, key, (mod_hash_val_t *)&dhp)) != 0) { ASSERT(err == MH_ERR_NOTFOUND); @@ -965,94 +933,68 @@ dls_link_add(dls_link_t *dlp, uint32_t sap, dls_impl_t *dip) } /* - * Add the dls_impl_t to the head of the list. + * Add the dld_str_t to the head of the list. List walkers in + * i_dls_link_rx_* bump up dh_ref to ensure the list does not change + * while they walk the list. The membar below ensures that list walkers + * see exactly the old list or the new list. */ - ASSERT(dip->di_nextp == NULL); + ASSERT(dsp->ds_next == NULL); p = dhp->dh_list; - dip->di_nextp = p; - dhp->dh_list = dip; + dsp->ds_next = p; - /* - * Save a pointer to the list head. - */ - dip->di_headp = dhp; - dlp->dl_impl_count++; + membar_producer(); - /* - * Walk the bound dls_impl_t to see if there are any - * in promiscuous 'all sap' mode. 
- */ - mod_hash_walk(hash, i_dls_link_walk, (void *)&promisc); - rw_exit(&dlp->dl_impl_lock); + dhp->dh_list = dsp; /* - * If there are then we need to use a receive routine - * which will route packets to those dls_impl_t as well - * as ones bound to the DLSAP of the packet. + * Save a pointer to the list head. */ - if (promisc) - rx = i_dls_link_rx_promisc; - else - rx = i_dls_link_rx; - - /* Replace the existing receive function if there is one. */ - if (dlp->dl_mrh != NULL) - mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE); - dlp->dl_mrh = mac_active_rx_add(dlp->dl_mh, rx, (void *)dlp); - mutex_exit(&dlp->dl_lock); + dsp->ds_head = dhp; + dlp->dl_impl_count++; } void -dls_link_remove(dls_link_t *dlp, dls_impl_t *dip) +dls_link_remove(dls_link_t *dlp, dld_str_t *dsp) { - mod_hash_t *hash = dlp->dl_impl_hash; - dls_impl_t **pp; - dls_impl_t *p; + mod_hash_t *hash = dlp->dl_str_hash; + dld_str_t **pp; + dld_str_t *p; dls_head_t *dhp; - mac_rx_t rx; - /* - * We need dl_lock here because we want to be able to walk - * the hash table *and* set the mac rx func atomically. if - * these two operations are separate, someone else could - * insert/remove dls_impl_t from the hash table after we - * drop the hash lock and this could cause our chosen rx - * func to be incorrect. note that we cannot call mac_rx_add - * when holding the hash lock because this can cause deadlock. - */ - mutex_enter(&dlp->dl_lock); - rw_enter(&dlp->dl_impl_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* - * Poll the hash table entry until all references have been dropped. - * We need to drop all locks before sleeping because we don't want - * the interrupt handler to block. We set di_removing here to - * tell the receive callbacks not to pass up packets anymore. - * This is only a hint to quicken the decrease of the refcnt so - * the assignment need not be protected by any lock. + * We set dh_removing here to tell the receive callbacks not to pass + * up packets anymore. Then wait till the current callbacks are done. + * This happens either in the close path or in processing the + * DL_UNBIND_REQ via a taskq thread, and it is ok to cv_wait in either. + * The dh_ref ensures there aren't and there won't be any upcalls + * walking or using the dh_list. The mod hash internal lock ensures + * that the insert/remove of the dls_head_t itself synchronizes with + * any i_dls_link_rx trying to locate it. The perimeter ensures that + * there isn't another simultaneous dls_link_add/remove. */ - dhp = dip->di_headp; - dip->di_removing = B_TRUE; - while (dhp->dh_ref != 0) { - rw_exit(&dlp->dl_impl_lock); - mutex_exit(&dlp->dl_lock); - delay(drv_usectohz(1000)); /* 1ms delay */ - mutex_enter(&dlp->dl_lock); - rw_enter(&dlp->dl_impl_lock, RW_WRITER); - } + dhp = dsp->ds_head; + + mutex_enter(&dhp->dh_lock); + dhp->dh_removing = B_TRUE; + while (dhp->dh_ref != 0) + cv_wait(&dhp->dh_cv, &dhp->dh_lock); + mutex_exit(&dhp->dh_lock); /* - * Walk the list and remove the dls_impl_t. + * Walk the list and remove the dld_str_t. 
*/ - for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->di_nextp)) { - if (p == dip) + for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->ds_next)) { + if (p == dsp) break; } ASSERT(p != NULL); - *pp = p->di_nextp; - p->di_nextp = NULL; + *pp = p->ds_next; + p->ds_next = NULL; + p->ds_head = NULL; - ASSERT(dlp->dl_impl_count > 0); + ASSERT(dlp->dl_impl_count != 0); dlp->dl_impl_count--; if (dhp->dh_list == NULL) { @@ -1064,41 +1006,11 @@ dls_link_remove(dls_link_t *dlp, dls_impl_t *dip) (void) mod_hash_remove(hash, dhp->dh_key, &val); ASSERT(dhp == (dls_head_t *)val); i_dls_head_free(dhp); - } - dip->di_removing = B_FALSE; - - /* - * If there are no dls_impl_t then there's no need to register a - * receive function with the mac. - */ - if (dlp->dl_impl_count == 0) { - rw_exit(&dlp->dl_impl_lock); - mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE); - dlp->dl_mrh = NULL; } else { - boolean_t promisc = B_FALSE; - - /* - * Walk the bound dls_impl_t to see if there are any - * in promiscuous 'all sap' mode. - */ - mod_hash_walk(hash, i_dls_link_walk, (void *)&promisc); - rw_exit(&dlp->dl_impl_lock); - - /* - * If there are then we need to use a receive routine - * which will route packets to those dls_impl_t as well - * as ones bound to the DLSAP of the packet. - */ - if (promisc) - rx = i_dls_link_rx_promisc; - else - rx = i_dls_link_rx; - - mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE); - dlp->dl_mrh = mac_active_rx_add(dlp->dl_mh, rx, (void *)dlp); + mutex_enter(&dhp->dh_lock); + dhp->dh_removing = B_FALSE; + mutex_exit(&dhp->dh_lock); } - mutex_exit(&dlp->dl_lock); } int @@ -1153,10 +1065,5 @@ dls_link_header_info(dls_link_t *dlp, mblk_t *mp, mac_header_info_t *mhip) mhip->mhi_tci = 0; } - /* - * The messsage is looped back from the underlying driver. - */ - mhip->mhi_prom_looped = (mp->b_flag & MSGNOLOOP); - return (0); } diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index bf5fc0a814..bb922423b3 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Datalink management routines. */ @@ -38,11 +36,17 @@ #include <sys/kstat.h> #include <sys/vnode.h> #include <sys/cmn_err.h> -#include <sys/vlan.h> #include <sys/softmac.h> #include <sys/dls.h> #include <sys/dls_impl.h> +/* + * This vanity name management module is treated as part of the GLD framework + * and we don't hold any GLD framework lock across a call to any mac + * function that needs to acquire the mac perimeter. The hierarchy is + * mac perimeter -> framework locks + */ + static kmem_cache_t *i_dls_devnet_cachep; static kmutex_t i_dls_mgmt_lock; static krwlock_t i_dls_devnet_lock; @@ -56,25 +60,22 @@ boolean_t devnet_need_rebuild; /* Upcall door handle */ static door_handle_t dls_mgmt_dh = NULL; +#define DD_CONDEMNED 0x1 + /* - * This structure is used to keep the <linkid, macname, vid> mapping. + * This structure is used to keep the <linkid, macname> mapping. 
*/ typedef struct dls_devnet_s { - datalink_id_t dd_vlanid; datalink_id_t dd_linkid; char dd_mac[MAXNAMELEN]; - uint16_t dd_vid; - char dd_spa[MAXSPALEN]; - boolean_t dd_explicit; kstat_t *dd_ksp; - uint32_t dd_ref; kmutex_t dd_mutex; kcondvar_t dd_cv; uint32_t dd_tref; + uint_t dd_flags; - kmutex_t dd_zid_mutex; zoneid_t dd_zid; boolean_t dd_prop_loaded; @@ -90,7 +91,6 @@ i_dls_devnet_constructor(void *buf, void *arg, int kmflag) bzero(buf, sizeof (dls_devnet_t)); mutex_init(&ddp->dd_mutex, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ddp->dd_zid_mutex, NULL, MUTEX_DEFAULT, NULL); cv_init(&ddp->dd_cv, NULL, CV_DEFAULT, NULL); return (0); } @@ -104,9 +104,7 @@ i_dls_devnet_destructor(void *buf, void *arg) ASSERT(ddp->dd_ksp == NULL); ASSERT(ddp->dd_ref == 0); ASSERT(ddp->dd_tref == 0); - ASSERT(!ddp->dd_explicit); mutex_destroy(&ddp->dd_mutex); - mutex_destroy(&ddp->dd_zid_mutex); cv_destroy(&ddp->dd_cv); } @@ -128,13 +126,13 @@ dls_mgmt_init(void) ASSERT(i_dls_devnet_cachep != NULL); /* - * Create a hash table, keyed by dd_vlanid, of dls_devnet_t. + * Create a hash table, keyed by dd_linkid, of dls_devnet_t. */ i_dls_devnet_id_hash = mod_hash_create_idhash("dls_devnet_id_hash", VLAN_HASHSZ, mod_hash_null_valdtor); /* - * Create a hash table, keyed by dd_spa. + * Create a hash table, keyed by dd_mac */ i_dls_devnet_hash = mod_hash_create_extended("dls_devnet_hash", VLAN_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, @@ -310,7 +308,6 @@ done: * registration of its mac * - class datalink class * - media type media type; DL_OTHER means unknown - * - vid VLAN ID (for VLANs) * - persist whether to persist the datalink */ int @@ -546,7 +543,7 @@ dls_devnet_prop_task(void *arg) { dls_devnet_t *ddp = arg; - (void) dls_mgmt_linkprop_init(ddp->dd_vlanid); + (void) dls_mgmt_linkprop_init(ddp->dd_linkid); mutex_enter(&ddp->dd_mutex); ddp->dd_prop_loaded = B_TRUE; @@ -567,58 +564,48 @@ dls_devnet_prop_task_wait(dls_dl_handle_t ddp) mutex_exit(&ddp->dd_mutex); } -/* - * Hold the vanity naming structure (dls_devnet_t) temporarily. The request to - * delete the dls_devnet_t will wait until the temporary reference is released. - */ +void +dls_devnet_rele_tmp(dls_dl_handle_t dlh) +{ + dls_devnet_t *ddp = dlh; + + mutex_enter(&ddp->dd_mutex); + ASSERT(ddp->dd_tref != 0); + if (--ddp->dd_tref == 0) + cv_signal(&ddp->dd_cv); + mutex_exit(&ddp->dd_mutex); +} + int -dls_devnet_hold_tmp(datalink_id_t linkid, dls_dl_handle_t *ddhp) +dls_devnet_hold_link(datalink_id_t linkid, dls_dl_handle_t *ddhp, + dls_link_t **dlpp) { - dls_devnet_t *ddp; - dls_dev_handle_t ddh = NULL; - dev_t phydev = 0; - int err; + dls_dl_handle_t dlh; + dls_link_t *dlp; + int err; - /* - * Hold this link to prevent it being detached (if physical link). - */ - if (dls_mgmt_get_phydev(linkid, &phydev) == 0) - (void) softmac_hold_device(phydev, &ddh); + if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0) + return (err); - rw_enter(&i_dls_devnet_lock, RW_READER); - if ((err = mod_hash_find(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)linkid, (mod_hash_val_t *)&ddp)) != 0) { - ASSERT(err == MH_ERR_NOTFOUND); - rw_exit(&i_dls_devnet_lock); - softmac_rele_device(ddh); - return (ENOENT); + if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) { + dls_devnet_rele_tmp(dlh); + return (err); } - /* - * At least one reference was held when this datalink was created. 
- */ - ASSERT(ddp->dd_ref > 0); - mutex_enter(&ddp->dd_mutex); - ddp->dd_tref++; - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - softmac_rele_device(ddh); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); -done: - *ddhp = ddp; + *ddhp = dlh; + *dlpp = dlp; return (0); } void -dls_devnet_rele_tmp(dls_dl_handle_t dlh) +dls_devnet_rele_link(dls_dl_handle_t dlh, dls_link_t *dlp) { - dls_devnet_t *ddp = dlh; + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); - mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_tref != 0); - if (--ddp->dd_tref == 0) - cv_signal(&ddp->dd_cv); - mutex_exit(&ddp->dd_mutex); + dls_link_rele(dlp); + dls_devnet_rele_tmp(dlh); } /* @@ -632,15 +619,23 @@ static int dls_devnet_stat_update(kstat_t *ksp, int rw) { dls_devnet_t *ddp = ksp->ks_private; - dls_vlan_t *dvp; + dls_link_t *dlp; int err; + mac_perim_handle_t mph; - err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, B_FALSE); + err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); if (err != 0) return (err); - err = dls_stat_update(ksp, dvp, rw); - dls_vlan_rele(dvp); + err = dls_link_hold(ddp->dd_mac, &dlp); + if (err != 0) { + mac_perim_exit(mph); + return (err); + } + + err = dls_stat_update(ksp, dlp, rw); + dls_link_rele(dlp); + mac_perim_exit(mph); return (err); } @@ -653,7 +648,7 @@ dls_devnet_stat_create(dls_devnet_t *ddp) char link[MAXLINKNAMELEN]; kstat_t *ksp; - if ((dls_mgmt_get_linkinfo(ddp->dd_vlanid, link, + if ((dls_mgmt_get_linkinfo(ddp->dd_linkid, link, NULL, NULL, NULL)) != 0) { return; } @@ -704,114 +699,53 @@ dls_devnet_stat_rename(dls_devnet_t *ddp, const char *link) } /* - * Associate a linkid with a given link (identified by <macname/vid>) - * - * Several cases: - * a. implicit VLAN creation: (non-NULL "vlan") - * b. explicit VLAN creation: (NULL "vlan") - * c. explicit non-VLAN creation: - * (NULL "vlan" and linkid could be INVALID_LINKID if the physical device - * was created before the daemon was started) + * Associate a linkid with a given link (identified by macname) */ static int -dls_devnet_set(const char *macname, uint16_t vid, - datalink_id_t vlan_linkid, datalink_id_t linkid, const char *vlan, - dls_devnet_t **ddpp) +dls_devnet_set(const char *macname, datalink_id_t linkid, dls_devnet_t **ddpp) { dls_devnet_t *ddp = NULL; - char spa[MAXSPALEN]; - boolean_t explicit = (vlan == NULL); datalink_class_t class; int err; - ASSERT(vid != VLAN_ID_NONE || explicit); - ASSERT(vlan_linkid != DATALINK_INVALID_LINKID || !explicit || - vid == VLAN_ID_NONE); - - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, - (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) == 0) { - char link[MAXLINKNAMELEN]; - - if (explicit) { - if ((vid != VLAN_ID_NONE) || - (ddp->dd_vlanid != DATALINK_INVALID_LINKID)) { - err = EEXIST; - goto done; - } - - /* - * This might be a physical link that has already - * been created, but which does not have a vlan_linkid - * because dlmgmtd was not running when it was created. - */ - if ((err = dls_mgmt_get_linkinfo(vlan_linkid, NULL, - &class, NULL, NULL)) != 0) { - goto done; - } - - if (class != DATALINK_CLASS_PHYS) { - err = EINVAL; - goto done; - } - - goto newphys; + (mod_hash_key_t)macname, (mod_hash_val_t *)&ddp)) == 0) { + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { + err = EEXIST; + goto done; } /* - * Implicit VLAN, but the same name has already - * been associated with another linkid. Check if the name - * of that link matches the given VLAN name. 
+ * This might be a physical link that has already + * been created, but which does not have a linkid + * because dlmgmtd was not running when it was created. */ - ASSERT(vid != VLAN_ID_NONE); - if ((err = dls_mgmt_get_linkinfo(ddp->dd_vlanid, link, - NULL, NULL, NULL)) != 0) { + if ((err = dls_mgmt_get_linkinfo(linkid, NULL, + &class, NULL, NULL)) != 0) { goto done; } - if (strcmp(link, vlan) != 0) { - err = EEXIST; + if (class != DATALINK_CLASS_PHYS) { + err = EINVAL; goto done; } - /* - * This is not an implicit created VLAN any more, return - * this existing datalink. - */ - ASSERT(ddp->dd_ref > 0); - ddp->dd_ref++; - goto done; - } - - /* - * Request the daemon to create a new vlan_linkid for this implicitly - * created vlan. - */ - if (!explicit && ((err = dls_mgmt_create(vlan, 0, - DATALINK_CLASS_VLAN, DL_ETHER, B_FALSE, &vlan_linkid)) != 0)) { - goto done; + goto newphys; } - ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP); - ddp->dd_vid = vid; - ddp->dd_explicit = explicit; ddp->dd_tref = 0; ddp->dd_ref++; ddp->dd_zid = GLOBAL_ZONEID; (void) strncpy(ddp->dd_mac, macname, MAXNAMELEN); - (void) snprintf(ddp->dd_spa, MAXSPALEN, "%s/%d", macname, vid); VERIFY(mod_hash_insert(i_dls_devnet_hash, - (mod_hash_key_t)ddp->dd_spa, (mod_hash_val_t)ddp) == 0); + (mod_hash_key_t)ddp->dd_mac, (mod_hash_val_t)ddp) == 0); newphys: - - ddp->dd_vlanid = vlan_linkid; - if (ddp->dd_vlanid != DATALINK_INVALID_LINKID) { + if (linkid != DATALINK_INVALID_LINKID) { ddp->dd_linkid = linkid; - VERIFY(mod_hash_insert(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)vlan_linkid, + (mod_hash_key_t)(uintptr_t)linkid, (mod_hash_val_t)ddp) == 0); devnet_need_rebuild = B_TRUE; dls_devnet_stat_create(ddp); @@ -832,90 +766,83 @@ done: return (err); } -static void -dls_devnet_unset_common(dls_devnet_t *ddp) -{ - mod_hash_val_t val; - - ASSERT(RW_WRITE_HELD(&i_dls_devnet_lock)); - - ASSERT(ddp->dd_ref == 0); - - /* - * Remove this dls_devnet_t from the hash table. - */ - VERIFY(mod_hash_remove(i_dls_devnet_hash, - (mod_hash_key_t)ddp->dd_spa, &val) == 0); - - if (ddp->dd_vlanid != DATALINK_INVALID_LINKID) { - VERIFY(mod_hash_remove(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)ddp->dd_vlanid, &val) == 0); - - dls_devnet_stat_destroy(ddp); - devnet_need_rebuild = B_TRUE; - } - - /* - * Wait until all temporary references are released. - */ - mutex_enter(&ddp->dd_mutex); - while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL)) - cv_wait(&ddp->dd_cv, &ddp->dd_mutex); - - ddp->dd_prop_loaded = B_FALSE; - mutex_exit(&ddp->dd_mutex); - - if (!ddp->dd_explicit) { - ASSERT(ddp->dd_vid != VLAN_ID_NONE); - ASSERT(ddp->dd_vlanid != DATALINK_INVALID_LINKID); - (void) dls_mgmt_destroy(ddp->dd_vlanid, B_FALSE); - } - - ddp->dd_vlanid = DATALINK_INVALID_LINKID; - ddp->dd_zid = GLOBAL_ZONEID; - ddp->dd_explicit = B_FALSE; - kmem_cache_free(i_dls_devnet_cachep, ddp); -} - /* - * Disassociate a linkid with a given link (identified by <macname/vid>) + * Disassociate a linkid with a given link (identified by macname) + * This waits until temporary references to the dls_devnet_t are gone. 
*/ static int -dls_devnet_unset(const char *macname, uint16_t vid, datalink_id_t *id) +dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) { dls_devnet_t *ddp; - char spa[MAXSPALEN]; int err; - - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); + mod_hash_val_t val; rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, - (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) != 0) { + (mod_hash_key_t)macname, (mod_hash_val_t *)&ddp)) != 0) { ASSERT(err == MH_ERR_NOTFOUND); rw_exit(&i_dls_devnet_lock); return (ENOENT); } - ASSERT(ddp->dd_ref != 0); + mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref != 1) { + /* + * Make sure downcalls into softmac_create or softmac_destroy from + * devfs don't cv_wait on any devfs related condition for fear of + * deadlock. Return EBUSY if the asynchronous thread started for + * property loading as part of the post attach hasn't yet completed. + */ + ASSERT(ddp->dd_ref != 0); + if ((ddp->dd_ref != 1) || (!wait && + (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { + mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (EBUSY); } + ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; + *id = ddp->dd_linkid; - if (id != NULL) - *id = ddp->dd_vlanid; + /* + * Remove this dls_devnet_t from the hash table. + */ + VERIFY(mod_hash_remove(i_dls_devnet_hash, + (mod_hash_key_t)ddp->dd_mac, &val) == 0); - dls_devnet_unset_common(ddp); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { + VERIFY(mod_hash_remove(i_dls_devnet_id_hash, + (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, &val) == 0); + + dls_devnet_stat_destroy(ddp); + devnet_need_rebuild = B_TRUE; + } rw_exit(&i_dls_devnet_lock); + + if (wait) { + /* + * Wait until all temporary references are released. + */ + while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL)) + cv_wait(&ddp->dd_cv, &ddp->dd_mutex); + } else { + ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); + } + + ddp->dd_prop_loaded = B_FALSE; + ddp->dd_linkid = DATALINK_INVALID_LINKID; + ddp->dd_zid = GLOBAL_ZONEID; + ddp->dd_flags = 0; + mutex_exit(&ddp->dd_mutex); + kmem_cache_free(i_dls_devnet_cachep, ddp); + return (0); } static int -dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp) +dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, + boolean_t tmp_hold) { dls_devnet_t *ddp; dev_t phydev = 0; @@ -938,39 +865,70 @@ dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp) return (ENOENT); } + mutex_enter(&ddp->dd_mutex); ASSERT(ddp->dd_ref > 0); - ddp->dd_ref++; + if (ddp->dd_flags & DD_CONDEMNED) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + softmac_rele_device(ddh); + return (ENOENT); + } + if (tmp_hold) + ddp->dd_tref++; + else + ddp->dd_ref++; + mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); + softmac_rele_device(ddh); -done: *ddpp = ddp; return (0); } +int +dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp) +{ + return (dls_devnet_hold_common(linkid, ddpp, B_FALSE)); +} + +/* + * Hold the vanity naming structure (dls_devnet_t) temporarily. The request to + * delete the dls_devnet_t will wait until the temporary reference is released. + */ +int +dls_devnet_hold_tmp(datalink_id_t linkid, dls_devnet_t **ddpp) +{ + return (dls_devnet_hold_common(linkid, ddpp, B_TRUE)); +} + /* * This funtion is called when a DLS client tries to open a device node. * This dev_t could a result of a /dev/net node access (returned by * devnet_create_rvp->dls_devnet_open()) or a direct /dev node access. 
- * In both cases, this function returns 0. In the first case, bump the - * reference count of the dls_devnet_t structure, so that it will not be - * freed when devnet_inactive_callback->dls_devnet_close() is called - * (Note that devnet_inactive_callback() is called right after dld_open, - * not when the /dev/net access is done). In the second case, ddhp would - * be NULL. - * - * To undo this function, call dls_devnet_close() in the first case, and call - * dls_vlan_rele() in the second case. + * In both cases, this function bumps up the reference count of the + * dls_devnet_t structure. The reference is held as long as the device node + * is open. In the case of /dev/net while it is true that the initial reference + * is held when the devnet_create_rvp->dls_devnet_open call happens, this + * initial reference is released immediately in devnet_inactive_callback -> + * dls_devnet_close(). (Note that devnet_inactive_callback() is called right + * after dld_open completes, not when the /dev/net node is being closed). + * To undo this function, call dls_devnet_rele() */ int -dls_devnet_open_by_dev(dev_t dev, dls_vlan_t **dvpp, dls_dl_handle_t *ddhp) +dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp) { + char name[MAXNAMELEN]; + char *drv; dls_dev_handle_t ddh = NULL; - char spa[MAXSPALEN]; dls_devnet_t *ddp; - dls_vlan_t *dvp; int err; + if ((drv = ddi_major_to_name(getmajor(dev))) == NULL) + return (EINVAL); + + (void) snprintf(name, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1); + /* * Hold this link to prevent it being detached in case of a * GLDv3 physical link. @@ -978,64 +936,49 @@ dls_devnet_open_by_dev(dev_t dev, dls_vlan_t **dvpp, dls_dl_handle_t *ddhp) if (getminor(dev) - 1 < MAC_MAX_MINOR) (void) softmac_hold_device(dev, &ddh); - /* - * Found the dls_vlan_t with the given dev. - */ - err = dls_vlan_hold_by_dev(dev, &dvp); - softmac_rele_device(ddh); - - if (err != 0) - return (err); - - (void) snprintf(spa, MAXSPALEN, "%s/%d", - dvp->dv_dlp->dl_name, dvp->dv_id); - rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, - (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) != 0) { + (mod_hash_key_t)name, (mod_hash_val_t *)&ddp)) != 0) { ASSERT(err == MH_ERR_NOTFOUND); rw_exit(&i_dls_devnet_lock); - *ddhp = NULL; - *dvpp = dvp; - return (0); + softmac_rele_device(ddh); + return (ENOENT); } - + mutex_enter(&ddp->dd_mutex); ASSERT(ddp->dd_ref > 0); + if (ddp->dd_flags & DD_CONDEMNED) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + softmac_rele_device(ddh); + return (ENOENT); + } ddp->dd_ref++; + mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); + + softmac_rele_device(ddh); + *ddhp = ddp; - *dvpp = dvp; return (0); } -static void +void dls_devnet_rele(dls_devnet_t *ddp) { - rw_enter(&i_dls_devnet_lock, RW_WRITER); - ASSERT(ddp->dd_ref != 0); - if (--ddp->dd_ref != 0) { - rw_exit(&i_dls_devnet_lock); - return; - } - /* - * This should only happen for implicitly-created VLAN. 
- */ - ASSERT(ddp->dd_vid != VLAN_ID_NONE); - dls_devnet_unset_common(ddp); - rw_exit(&i_dls_devnet_lock); + mutex_enter(&ddp->dd_mutex); + ASSERT(ddp->dd_ref > 1); + ddp->dd_ref--; + mutex_exit(&ddp->dd_mutex); } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) { - char link_under[MAXLINKNAMELEN]; char drv[MAXLINKNAMELEN]; uint_t ppa; major_t major; dev_t phy_dev, tmp_dev; - uint_t vid; datalink_id_t linkid; - dls_devnet_t *ddp; dls_dev_handle_t ddh; int err; @@ -1056,35 +999,8 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) if (ddi_parse(link, drv, &ppa) != DDI_SUCCESS) return (ENOENT); - if ((vid = DLS_PPA2VID(ppa)) > VLAN_ID_MAX) - return (ENOENT); - - ppa = (uint_t)DLS_PPA2INST(ppa); - (void) snprintf(link_under, sizeof (link_under), "%s%d", drv, ppa); - - if (vid != VLAN_ID_NONE) { - /* - * Only global zone can implicitly create a VLAN. - */ - if (zid != GLOBAL_ZONEID) - return (ENOENT); - - /* - * This is potentially an implicitly-created VLAN. Hold the - * link this VLAN is created on. - */ - if (dls_mgmt_get_linkid(link_under, &linkid) == 0 && - dls_devnet_hold_tmp(linkid, &ddp) == 0) { - if (ddp->dd_vid != VLAN_ID_NONE) { - dls_devnet_rele_tmp(ddp); - return (ENOENT); - } - goto implicit; - } - } - /* - * If this link (or the link that an implicit vlan is created on) + * If this link: * (a) is a physical device, (b) this is the first boot, (c) the MAC * is not registered yet, and (d) we cannot find its linkid, then the * linkname is the same as the devname. @@ -1102,7 +1018,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) * At this time, the MAC should be registered, check its phy_dev using * the given name. */ - if ((err = dls_mgmt_get_linkid(link_under, &linkid)) != 0 || + if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0 || (err = dls_mgmt_get_phydev(linkid, &tmp_dev)) != 0) { softmac_rele_device(ddh); return (err); @@ -1112,65 +1028,45 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) return (ENOENT); } - if (vid == VLAN_ID_NONE) { - /* - * For non-VLAN, we are done. - */ - err = dls_devnet_hold(linkid, ddpp); - softmac_rele_device(ddh); - return (err); - } - - /* - * If this is an implicit VLAN, temporarily hold this non-VLAN. - */ - VERIFY(dls_devnet_hold_tmp(linkid, &ddp) == 0); + err = dls_devnet_hold(linkid, ddpp); softmac_rele_device(ddh); - ASSERT(ddp->dd_vid == VLAN_ID_NONE); - - /* - * Again, this is potentially an implicitly-created VLAN. - */ - -implicit: - ASSERT(vid != VLAN_ID_NONE); - err = dls_devnet_set(ddp->dd_mac, vid, DATALINK_INVALID_LINKID, - linkid, link, ddpp); - dls_devnet_rele_tmp(ddp); return (err); } -/* - * Get linkid for the given dev. 
- */
 int
-dls_devnet_dev2linkid(dev_t dev, datalink_id_t *linkidp)
+dls_devnet_macname2linkid(const char *macname, datalink_id_t *linkidp)
 {
-   dls_vlan_t  *dvp;
    dls_devnet_t    *ddp;
-   char        spa[MAXSPALEN];
-   int     err;
-
-   if ((err = dls_vlan_hold_by_dev(dev, &dvp)) != 0)
-       return (err);
-
-   (void) snprintf(spa, MAXSPALEN, "%s/%d",
-       dvp->dv_dlp->dl_name, dvp->dv_id);

    rw_enter(&i_dls_devnet_lock, RW_READER);
-   if (mod_hash_find(i_dls_devnet_hash, (mod_hash_key_t)spa,
+   if (mod_hash_find(i_dls_devnet_hash, (mod_hash_key_t)macname,
        (mod_hash_val_t *)&ddp) != 0) {
        rw_exit(&i_dls_devnet_lock);
-       dls_vlan_rele(dvp);
        return (ENOENT);
    }

-   *linkidp = ddp->dd_vlanid;
+   *linkidp = ddp->dd_linkid;
    rw_exit(&i_dls_devnet_lock);
-   dls_vlan_rele(dvp);
    return (0);
 }
+
+/*
+ * Get linkid for the given dev.
+ */
+int
+dls_devnet_dev2linkid(dev_t dev, datalink_id_t *linkidp)
+{
+   char    macname[MAXNAMELEN];
+   char    *drv;
+
+   if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
+       return (EINVAL);
+
+   (void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1);
+   return (dls_devnet_macname2linkid(macname, linkidp));
+}
+
 /*
  * Get the link's physical dev_t. If this is a VLAN, get the dev_t of the
  * link this VLAN is created on.
  */
@@ -1213,6 +1109,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    int     err = 0;
    dev_t       phydev = 0;
    dls_devnet_t    *ddp;
+   mac_perim_handle_t  mph = NULL;
    mac_handle_t    mh;
    mod_hash_val_t  val;
@@ -1232,6 +1129,14 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    if (dls_mgmt_get_phydev(id1, &phydev) == 0)
        (void) softmac_hold_device(phydev, &ddh);

+   /*
+    * The framework does not hold locks across calls to the
+    * mac perimeter, hence enter the perimeter first. This also waits
+    * for the property loading to finish.
+    */
+   if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0)
+       goto done;
+
    rw_enter(&i_dls_devnet_lock, RW_WRITER);
    if ((err = mod_hash_find(i_dls_devnet_id_hash,
        (mod_hash_key_t)(uintptr_t)id1, (mod_hash_val_t *)&ddp)) != 0) {
@@ -1241,41 +1146,21 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    }

    /*
-    * Let the property loading thread finish.
-    * Unfortunately, we have to drop i_dls_devnet_lock temporarily
-    * to avoid deadlocks, and ensure ddp is still in the hash after
-    * reacquiring it. Observe lock order as well.
-    */
-   mutex_enter(&ddp->dd_mutex);
-   if (ddp->dd_prop_taskid != NULL) {
-       rw_exit(&i_dls_devnet_lock);
-       while (ddp->dd_prop_taskid != NULL)
-           cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
-       mutex_exit(&ddp->dd_mutex);
-       rw_enter(&i_dls_devnet_lock, RW_WRITER);
-
-       if ((err = mod_hash_find(i_dls_devnet_id_hash,
-           (mod_hash_key_t)(uintptr_t)id1,
-           (mod_hash_val_t *)&ddp)) != 0) {
-           ASSERT(err == MH_ERR_NOTFOUND);
-           err = ENOENT;
-           goto done;
-       }
-   } else {
-       mutex_exit(&ddp->dd_mutex);
-   }
-
-   /*
     * Return EBUSY if any applications have this link open.
     */
-   if ((ddp->dd_explicit && ddp->dd_ref > 1) ||
-       (!ddp->dd_explicit && ddp->dd_ref > 0)) {
+   if (ddp->dd_ref > 1) {
        err = EBUSY;
        goto done;
    }

    if (id2 == DATALINK_INVALID_LINKID) {
        (void) strlcpy(linkname, link, sizeof (linkname));
+
+       /* rename the mac client name and its flow, if it exists */
+       if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
+           goto done;
+       (void) mac_rename_primary(mh, link);
+       mac_close(mh);
        goto done;
    }
@@ -1294,7 +1179,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    /*
     * We release the reference of the MAC which mac_open() is
     * holding.
Note that this mac will not be unregistered - * because the physical device is hold. + * because the physical device is held. */ mac_close(mh); @@ -1302,7 +1187,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) * Check if there is any other MAC clients, if not, hold this mac * exclusively until we are done. */ - if ((err = mac_hold_exclusive(mh)) != 0) + if ((err = mac_mark_exclusive(mh)) != 0) goto done; /* @@ -1310,23 +1195,25 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) */ if ((err = mod_hash_find(i_dls_devnet_id_hash, (mod_hash_key_t)(uintptr_t)id2, &val)) != MH_ERR_NOTFOUND) { - mac_rele_exclusive(mh); + mac_unmark_exclusive(mh); err = EEXIST; goto done; } err = dls_mgmt_get_linkinfo(id2, linkname, NULL, NULL, NULL); if (err != 0) { - mac_rele_exclusive(mh); + mac_unmark_exclusive(mh); goto done; } (void) mod_hash_remove(i_dls_devnet_id_hash, (mod_hash_key_t)(uintptr_t)id1, &val); - ddp->dd_vlanid = id2; + ddp->dd_linkid = id2; (void) mod_hash_insert(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)ddp->dd_vlanid, (mod_hash_val_t)ddp); + (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, (mod_hash_val_t)ddp); + + mac_unmark_exclusive(mh); /* load properties for new id */ mutex_enter(&ddp->dd_mutex); @@ -1335,8 +1222,6 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) dls_devnet_prop_task, ddp, TQ_SLEEP); mutex_exit(&ddp->dd_mutex); - mac_rele_exclusive(mh); - done: /* * Change the name of the kstat based on the new link name. @@ -1345,6 +1230,8 @@ done: dls_devnet_stat_rename(ddp, linkname); rw_exit(&i_dls_devnet_lock); + if (mph != NULL) + mac_perim_exit(mph); softmac_rele_device(ddh); return (err); } @@ -1355,26 +1242,30 @@ dls_devnet_setzid(const char *link, zoneid_t zid) dls_devnet_t *ddp; int err; zoneid_t old_zid; + mac_perim_handle_t mph; + + if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + return (err); - if ((err = dls_devnet_hold_by_name(link, &ddp, GLOBAL_ZONEID)) != 0) + err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); + if (err != 0) return (err); - mutex_enter(&ddp->dd_zid_mutex); if ((old_zid = ddp->dd_zid) == zid) { - mutex_exit(&ddp->dd_zid_mutex); + mac_perim_exit(mph); dls_devnet_rele(ddp); return (0); } - if ((err = dls_vlan_setzid(ddp->dd_mac, ddp->dd_vid, zid)) != 0) { - mutex_exit(&ddp->dd_zid_mutex); + if ((err = dls_link_setzid(ddp->dd_mac, zid)) != 0) { + mac_perim_exit(mph); dls_devnet_rele(ddp); return (err); } ddp->dd_zid = zid; devnet_need_rebuild = B_TRUE; - mutex_exit(&ddp->dd_zid_mutex); + mac_perim_exit(mph); /* * Keep this open reference only if it belonged to the global zone @@ -1402,9 +1293,7 @@ dls_devnet_getzid(datalink_id_t linkid, zoneid_t *zidp) if ((err = dls_devnet_hold_tmp(linkid, &ddp)) != 0) return (err); - mutex_enter(&ddp->dd_zid_mutex); *zidp = ddp->dd_zid; - mutex_exit(&ddp->dd_zid_mutex); dls_devnet_rele_tmp(ddp); return (0); @@ -1417,13 +1306,16 @@ int dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) { dls_devnet_t *ddp; - dls_vlan_t *dvp; + dls_link_t *dlp; zoneid_t zid = getzoneid(); int err; + mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) + if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) return (err); + dls_devnet_prop_task_wait(ddp); + /* * Opening a link that does not belong to the current non-global zone * is not allowed. 
@@ -1433,16 +1325,22 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (ENOENT); } - err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, B_TRUE); + err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); if (err != 0) { dls_devnet_rele(ddp); return (err); } - dls_devnet_prop_task_wait(ddp); + err = dls_link_hold_create(ddp->dd_mac, &dlp); + mac_perim_exit(mph); + + if (err != 0) { + dls_devnet_rele(ddp); + return (err); + } *dhp = ddp; - *devp = dvp->dv_dev; + *devp = dls_link_dev(dlp); return (0); } @@ -1453,15 +1351,20 @@ void dls_devnet_close(dls_dl_handle_t dlh) { dls_devnet_t *ddp = dlh; - dls_vlan_t *dvp; + dls_link_t *dlp; + mac_perim_handle_t mph; + + VERIFY(mac_perim_enter_by_macname(ddp->dd_mac, &mph) == 0); + VERIFY(dls_link_hold(ddp->dd_mac, &dlp) == 0); /* - * The VLAN is hold in dls_open_devnet_link(). + * One rele for the hold placed in dls_devnet_open, another for + * the hold done just above */ - VERIFY((dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, - B_FALSE)) == 0); - dls_vlan_rele(dvp); - dls_vlan_rele(dvp); + dls_link_rele(dlp); + dls_link_rele(dlp); + mac_perim_exit(mph); + dls_devnet_rele(ddp); } @@ -1481,15 +1384,27 @@ dls_devnet_rebuild() int dls_devnet_create(mac_handle_t mh, datalink_id_t linkid) { + dls_link_t *dlp; int err; + mac_perim_handle_t mph; - if ((err = dls_vlan_create(mac_name(mh), 0, B_FALSE)) != 0) - return (err); - - err = dls_devnet_set(mac_name(mh), 0, linkid, linkid, NULL, NULL); - if (err != 0) - (void) dls_vlan_destroy(mac_name(mh), 0); + mac_perim_enter_by_mh(mh, &mph); + /* + * Make this association before we call dls_link_hold_create as + * we need to use the linkid to get the user name for the link + * when we create the MAC client. + */ + if ((err = dls_devnet_set(mac_name(mh), linkid, NULL)) != 0) { + mac_perim_exit(mph); + return (err); + } + if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { + (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); + mac_perim_exit(mph); + return (err); + } + mac_perim_exit(mph); return (err); } @@ -1503,134 +1418,39 @@ int dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid) { ASSERT(linkid != DATALINK_INVALID_LINKID); - return (dls_devnet_set(mac_name(mh), 0, linkid, linkid, NULL, NULL)); + return (dls_devnet_set(mac_name(mh), linkid, NULL)); } int -dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp) +dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) { - int err; + int err; + mac_perim_handle_t mph; *idp = DATALINK_INVALID_LINKID; - err = dls_devnet_unset(mac_name(mh), 0, idp); + err = dls_devnet_unset(mac_name(mh), idp, wait); if (err != 0 && err != ENOENT) return (err); - if ((err = dls_vlan_destroy(mac_name(mh), 0)) == 0) - return (0); - - (void) dls_devnet_set(mac_name(mh), 0, *idp, *idp, NULL, NULL); - return (err); -} + mac_perim_enter_by_mh(mh, &mph); + err = dls_link_rele_by_name(mac_name(mh)); + mac_perim_exit(mph); -int -dls_devnet_create_vlan(datalink_id_t vlanid, datalink_id_t linkid, - uint16_t vid, boolean_t force) -{ - dls_devnet_t *lnddp, *ddp; - dls_vlan_t *dvp; - int err; - - /* - * Hold the link the VLAN is being created on (which must not be a - * VLAN). - */ - ASSERT(vid != VLAN_ID_NONE); - if ((err = dls_devnet_hold_tmp(linkid, &lnddp)) != 0) - return (err); - - if (lnddp->dd_vid != VLAN_ID_NONE) { - err = EINVAL; - goto done; - } - - /* - * A new link. 
- */ - err = dls_devnet_set(lnddp->dd_mac, vid, vlanid, linkid, NULL, &ddp); - if (err != 0) - goto done; - - /* - * Hold the dls_vlan_t (and create it if needed). - */ - err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, force, B_TRUE); - if (err != 0) - VERIFY(dls_devnet_unset(lnddp->dd_mac, vid, NULL) == 0); + if (err == 0) + return (0); -done: - dls_devnet_rele_tmp(lnddp); + (void) dls_devnet_set(mac_name(mh), *idp, NULL); return (err); } -int -dls_devnet_destroy_vlan(datalink_id_t vlanid) -{ - char macname[MAXNAMELEN]; - uint16_t vid; - dls_devnet_t *ddp; - dls_vlan_t *dvp; - int err; - - if ((err = dls_devnet_hold_tmp(vlanid, &ddp)) != 0) - return (err); - - if (ddp->dd_vid == VLAN_ID_NONE) { - dls_devnet_rele_tmp(ddp); - return (EINVAL); - } - - if (!ddp->dd_explicit) { - dls_devnet_rele_tmp(ddp); - return (EBUSY); - } - - (void) strncpy(macname, ddp->dd_mac, MAXNAMELEN); - vid = ddp->dd_vid; - - /* - * It is safe to release the temporary reference we just held, as the - * reference from VLAN creation is still held. - */ - dls_devnet_rele_tmp(ddp); - - if ((err = dls_devnet_unset(macname, vid, NULL)) != 0) - return (err); - - /* - * This VLAN has already been held as the result of VLAN creation. - */ - VERIFY(dls_vlan_hold(macname, vid, &dvp, B_FALSE, B_FALSE) == 0); - - /* - * Release the reference which was held when this VLAN was created, - * and the reference which was just held. - */ - dls_vlan_rele(dvp); - dls_vlan_rele(dvp); - return (0); -} - const char * dls_devnet_mac(dls_dl_handle_t ddh) { return (ddh->dd_mac); } -uint16_t -dls_devnet_vid(dls_dl_handle_t ddh) -{ - return (ddh->dd_vid); -} - datalink_id_t dls_devnet_linkid(dls_dl_handle_t ddh) { return (ddh->dd_linkid); } - -boolean_t -dls_devnet_is_explicit(dls_dl_handle_t ddh) -{ - return (ddh->dd_explicit); -} diff --git a/usr/src/uts/common/io/dls/dls_mod.c b/usr/src/uts/common/io/dls/dls_mod.c index b93befd45c..5f594a0ff9 100644 --- a/usr/src/uts/common/io/dls/dls_mod.c +++ b/usr/src/uts/common/io/dls/dls_mod.c @@ -23,18 +23,12 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> #include <sys/modctl.h> -#include <sys/mac.h> - -#include <sys/dls.h> -#include <sys/dls_impl.h> +#include <sys/dld_impl.h> static struct modlmisc i_dls_modlmisc = { &mod_miscops, @@ -54,8 +48,6 @@ static struct modlinkage i_dls_modlinkage = { static void i_dls_mod_init(void) { - dls_init(); - dls_vlan_init(); dls_link_init(); dls_mgmt_init(); } @@ -69,13 +61,6 @@ i_dls_mod_fini(void) return (err); dls_mgmt_fini(); - - err = dls_vlan_fini(); - ASSERT(err == 0); - - err = dls_fini(); - ASSERT(err == 0); - return (0); } diff --git a/usr/src/uts/common/io/dls/dls_soft_ring.c b/usr/src/uts/common/io/dls/dls_soft_ring.c deleted file mode 100644 index 078b9a9e4c..0000000000 --- a/usr/src/uts/common/io/dls/dls_soft_ring.c +++ /dev/null @@ -1,773 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * General Soft rings - Simulating Rx rings in S/W. - * - * This is a general purpose high-performance soft ring mechanism. It is - * similar to a taskq with a single worker thread. The dls creates a - * set of these rings to simulate the H/W Rx ring (DMA channels) some - * NICs have. The purpose is to present a common interface to IP - * so the individual squeues can control these rings and switch them - * between polling and interrupt mode. - * - * This code also serves as a fanout mechanism for fast NIC feeding slow - * CPU where incoming traffic can be separated into multiple soft rings - * based on capability negotiation with IP and IP also creates thread - * affinity to soft ring worker threads to CPU so that conenction to - * CPU/Squeue affinity is never broken. - * - * The soft rings can also be driven by a classifier which can direct - * traffic to individual soft rings based on the input from IP. - */ - -#include <sys/types.h> -#include <sys/cmn_err.h> -#include <sys/debug.h> -#include <sys/kmem.h> -#include <sys/cpuvar.h> -#include <sys/condvar_impl.h> -#include <sys/systm.h> -#include <sys/callb.h> -#include <sys/sdt.h> -#include <sys/ddi.h> -#include <sys/strsun.h> -#include <sys/strsubr.h> -#include <inet/common.h> -#include <inet/ip.h> -#include <inet/ipsec_impl.h> -#include <inet/sadb.h> -#include <inet/ipsecah.h> - -#include <sys/dls_impl.h> -#include <sys/dls_soft_ring.h> - -static void soft_ring_fire(void *); -static void soft_ring_drain(soft_ring_t *, clock_t); -static void soft_ring_worker(soft_ring_t *); -static void soft_ring_stop_workers(soft_ring_t **, int); -static void dls_taskq_stop_soft_ring(void *); - -typedef struct soft_ring_taskq { - soft_ring_t **ringp_list; - uint_t ring_size; -} soft_ring_taskq_t; - -kmem_cache_t *soft_ring_cache; - - -int soft_ring_workerwait_ms = 10; -int soft_ring_max_q_cnt = (4 * 1024 * 1024); - -/* The values above converted to ticks */ -static int soft_ring_workerwait_tick = 0; - -#define SOFT_RING_WORKER_WAKEUP(ringp) { \ - timeout_id_t tid = (ringp)->s_ring_tid; \ - \ - ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ - /* \ - * Queue isn't being processed, so take \ - * any post enqueue actions needed before leaving. \ - */ \ - if (tid != 0) { \ - /* \ - * Waiting for an enter() to process mblk(s). \ - */ \ - clock_t waited = lbolt - (ringp)->s_ring_awaken; \ - \ - if (TICK_TO_MSEC(waited) >= (ringp)->s_ring_wait) { \ - /* \ - * Times up and have a worker thread \ - * waiting for work, so schedule it. \ - */ \ - (ringp)->s_ring_tid = 0; \ - cv_signal(&(ringp)->s_ring_async); \ - mutex_exit(&(ringp)->s_ring_lock); \ - (void) untimeout(tid); \ - } else { \ - mutex_exit(&(ringp)->s_ring_lock); \ - } \ - } else if ((ringp)->s_ring_wait != 0) { \ - (ringp)->s_ring_awaken = lbolt; \ - (ringp)->s_ring_tid = timeout(soft_ring_fire, (ringp), \ - (ringp)->s_ring_wait); \ - mutex_exit(&(ringp)->s_ring_lock); \ - } else { \ - /* \ - * Schedule the worker thread. 
\ - */ \ - cv_signal(&(ringp)->s_ring_async); \ - mutex_exit(&(ringp)->s_ring_lock); \ - } \ - ASSERT(MUTEX_NOT_HELD(&(ringp)->s_ring_lock)); \ -} - - -#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt) { \ - /* \ - * Enqueue our mblk chain. \ - */ \ - ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ - \ - if ((ringp)->s_ring_last != NULL) \ - (ringp)->s_ring_last->b_next = (mp); \ - else \ - (ringp)->s_ring_first = (mp); \ - (ringp)->s_ring_last = (tail); \ - (ringp)->s_ring_count += (cnt); \ - ASSERT((ringp)->s_ring_count > 0); \ -} - -void -soft_ring_init(void) -{ - soft_ring_cache = kmem_cache_create("soft_ring_cache", - sizeof (soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0); - - soft_ring_workerwait_tick = - MSEC_TO_TICK_ROUNDUP(soft_ring_workerwait_ms); -} - -/* ARGSUSED */ -soft_ring_t * -soft_ring_create(char *name, processorid_t bind, clock_t wait, - uint_t type, pri_t pri) -{ - soft_ring_t *ringp; - - ringp = kmem_cache_alloc(soft_ring_cache, KM_NOSLEEP); - if (ringp == NULL) - return (NULL); - - bzero(ringp, sizeof (soft_ring_t)); - (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1); - ringp->s_ring_name[S_RING_NAMELEN] = '\0'; - mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL); - - ringp->s_ring_type = type; - ringp->s_ring_bind = bind; - if (bind != S_RING_BIND_NONE) - soft_ring_bind(ringp, bind); - ringp->s_ring_wait = MSEC_TO_TICK(wait); - - ringp->s_ring_worker = thread_create(NULL, 0, soft_ring_worker, - ringp, 0, &p0, TS_RUN, pri); - - return (ringp); -} - -soft_ring_t ** -soft_ring_set_create(char *name, processorid_t bind, clock_t wait, - uint_t type, pri_t pri, int ring_size) -{ - int i; - soft_ring_t **ringp_list; - - if ((ringp_list = - (soft_ring_t **) kmem_zalloc(sizeof (soft_ring_t *) * ring_size, - KM_NOSLEEP)) != NULL) { - for (i = 0; i < ring_size; i++) { - ringp_list[i] = soft_ring_create(name, bind, wait, - type, pri); - if (ringp_list[i] == NULL) - break; - } - if (i != ring_size) { - soft_ring_stop_workers(ringp_list, ring_size); - soft_ring_set_destroy(ringp_list, ring_size); - ringp_list = NULL; - } - } - return (ringp_list); -} - -static void -soft_ring_stop_workers(soft_ring_t **ringp_set, int ring_size) -{ - int i; - soft_ring_t *ringp; - timeout_id_t tid; - kt_did_t t_did = 0; - - for (i = 0; (i < ring_size) && (ringp_set[i] != NULL); i++) { - ringp = ringp_set[i]; - - soft_ring_unbind((void *)ringp); - mutex_enter(&ringp->s_ring_lock); - if ((tid = ringp->s_ring_tid) != 0) - (void) untimeout(tid); - - ringp->s_ring_tid = 0; - - if (!(ringp->s_ring_state & S_RING_DEAD)) { - ringp->s_ring_state |= S_RING_DESTROY; - t_did = ringp->s_ring_worker->t_did; - - - /* Wake the worker so it can exit */ - cv_signal(&(ringp)->s_ring_async); - } - mutex_exit(&ringp->s_ring_lock); - - /* - * Here comes the tricky part. IP and driver ensure - * that packet flow has stopped but worker thread - * might still be draining the soft ring. We have - * already set the S_RING_DESTROY flag. We wait till - * the worker thread takes notice and stops processing - * the soft_ring and exits. It sets S_RING_DEAD on - * exiting. 
- */ - if (t_did) - thread_join(t_did); - } -} - -void -soft_ring_set_destroy(soft_ring_t **ringp_set, int ring_size) -{ - int i; - mblk_t *mp; - soft_ring_t *ringp; - - for (i = 0; (i < ring_size) && (ringp_set[i] != NULL); i++) { - ringp = ringp_set[i]; - - mutex_enter(&ringp->s_ring_lock); - - ASSERT(ringp->s_ring_state & S_RING_DEAD); - - while ((mp = ringp->s_ring_first) != NULL) { - ringp->s_ring_first = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - } - ringp->s_ring_last = NULL; - mutex_exit(&ringp->s_ring_lock); - - /* - * IP/driver ensure that no packets are flowing - * when we are destroying the soft rings otherwise bad - * things will happen. - */ - kmem_cache_free(soft_ring_cache, ringp); - ringp_set[i] = NULL; - } - kmem_free(ringp_set, sizeof (soft_ring_t *) * ring_size); -} - -/* ARGSUSED */ -void -soft_ring_bind(void *arg, processorid_t bind) -{ - cpu_t *cp; - soft_ring_t *ringp = (soft_ring_t *)arg; - - mutex_enter(&ringp->s_ring_lock); - if (ringp->s_ring_state & S_RING_BOUND) { - mutex_exit(&ringp->s_ring_lock); - return; - } - - ringp->s_ring_state |= S_RING_BOUND; - ringp->s_ring_bind = bind; - mutex_exit(&ringp->s_ring_lock); - - cp = cpu[bind]; - mutex_enter(&cpu_lock); - if (cpu_is_online(cp)) { - thread_affinity_set(ringp->s_ring_worker, ringp->s_ring_bind); - } - mutex_exit(&cpu_lock); -} - -void -soft_ring_unbind(void *arg) -{ - soft_ring_t *ringp = (soft_ring_t *)arg; - - mutex_enter(&ringp->s_ring_lock); - if (!(ringp->s_ring_state & S_RING_BOUND)) { - mutex_exit(&ringp->s_ring_lock); - return; - } - - ringp->s_ring_state &= ~S_RING_BOUND; - ringp->s_ring_bind = S_RING_BIND_NONE; - mutex_exit(&ringp->s_ring_lock); - - thread_affinity_clear(ringp->s_ring_worker); -} - -/* - * soft_ring_enter() - enter soft_ring sqp with mblk mp (which can be - * a chain), while tail points to the end and cnt in number of - * mblks in the chain. - * - * For a chain of single packet (i.e. mp == tail), go through the - * fast path if no one is processing the soft_ring and nothing is queued. - * - * The proc and arg for each mblk is already stored in the mblk in - * appropriate places. - */ -/* ARGSUSED */ -static void -soft_ring_process(soft_ring_t *ringp, - mblk_t *mp_chain, mblk_t *tail, uint_t count) -{ - void *arg1, *arg2; - s_ring_proc_t proc; - - ASSERT(ringp != NULL); - ASSERT(mp_chain != NULL); - ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); - - mutex_enter(&ringp->s_ring_lock); - - ringp->s_ring_total_inpkt += count; - if (!(ringp->s_ring_state & S_RING_PROC) && - !(ringp->s_ring_type == S_RING_WORKER_ONLY)) { - /* - * See if anything is already queued. If we are the - * first packet, do inline processing else queue the - * packet and do the drain. - */ - if (ringp->s_ring_first == NULL && count == 1) { - /* - * Fast-path, ok to process and nothing queued. - */ - ringp->s_ring_run = curthread; - ringp->s_ring_state |= (S_RING_PROC); - - /* - * We are the chain of 1 packet so - * go through this fast path. - */ - ASSERT(mp_chain->b_next == NULL); - proc = ringp->s_ring_upcall; - arg1 = ringp->s_ring_upcall_arg1; - arg2 = ringp->s_ring_upcall_arg2; - - mutex_exit(&ringp->s_ring_lock); - (*proc)(arg1, arg2, mp_chain, NULL); - - ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); - mutex_enter(&ringp->s_ring_lock); - ringp->s_ring_run = NULL; - ringp->s_ring_state &= ~S_RING_PROC; - if (ringp->s_ring_first == NULL) { - /* - * We processed inline our packet and - * nothing new has arrived. We are done. 
- */ - mutex_exit(&ringp->s_ring_lock); - return; - } - } else { - SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, count); - } - - /* - * We are here because either we couldn't do inline - * processing (because something was already queued), - * or we had a chanin of more than one packet, - * or something else arrived after we were done with - * inline processing. - */ - ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); - ASSERT(ringp->s_ring_first != NULL); - - - soft_ring_drain(ringp, -1); - mutex_exit(&ringp->s_ring_lock); - return; - } else { - /* - * Queue is already being processed. Just enqueue - * the packet and go away. - */ - if (ringp->s_ring_count > soft_ring_max_q_cnt) { - freemsgchain(mp_chain); - DLS_BUMP_STAT(dlss_soft_ring_pkt_drop, count); - } else - SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, count); - if (!(ringp->s_ring_state & S_RING_PROC)) { - SOFT_RING_WORKER_WAKEUP(ringp); - } else { - ASSERT(ringp->s_ring_run != NULL); - mutex_exit(&ringp->s_ring_lock); - } - return; - } -} - -/* - * PRIVATE FUNCTIONS - */ - -static void -soft_ring_fire(void *arg) -{ - soft_ring_t *ringp = arg; - - mutex_enter(&ringp->s_ring_lock); - if (ringp->s_ring_tid == 0) { - mutex_exit(&ringp->s_ring_lock); - return; - } - - ringp->s_ring_tid = 0; - - if (!(ringp->s_ring_state & S_RING_PROC)) { - cv_signal(&ringp->s_ring_async); - } - mutex_exit(&ringp->s_ring_lock); -} - -/* ARGSUSED */ -static void -soft_ring_drain(soft_ring_t *ringp, clock_t expire) -{ - mblk_t *mp; - s_ring_proc_t proc; - void *arg1, *arg2; - timeout_id_t tid; - - ringp->s_ring_run = curthread; - ASSERT(mutex_owned(&ringp->s_ring_lock)); - ASSERT(!(ringp->s_ring_state & S_RING_PROC)); - - if ((tid = ringp->s_ring_tid) != 0) - ringp->s_ring_tid = 0; - - ringp->s_ring_state |= S_RING_PROC; - - - proc = ringp->s_ring_upcall; - arg1 = ringp->s_ring_upcall_arg1; - arg2 = ringp->s_ring_upcall_arg2; - - while (ringp->s_ring_first != NULL) { - mp = ringp->s_ring_first; - ringp->s_ring_first = NULL; - ringp->s_ring_last = NULL; - ringp->s_ring_count = 0; - mutex_exit(&ringp->s_ring_lock); - - if (tid != 0) { - (void) untimeout(tid); - tid = 0; - } - - (*proc)(arg1, arg2, mp, NULL); - - mutex_enter(&ringp->s_ring_lock); - } - - ringp->s_ring_state &= ~S_RING_PROC; - ringp->s_ring_run = NULL; -} - -static void -soft_ring_worker(soft_ring_t *ringp) -{ - kmutex_t *lock = &ringp->s_ring_lock; - kcondvar_t *async = &ringp->s_ring_async; - callb_cpr_t cprinfo; - - CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "soft_ring"); - mutex_enter(lock); - - for (;;) { - while (ringp->s_ring_first == NULL || - (ringp->s_ring_state & S_RING_PROC)) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - if (ringp->s_ring_state & S_RING_DESTROY) - goto destroy; -still_wait: - cv_wait(async, lock); - if (ringp->s_ring_state & S_RING_DESTROY) { -destroy: - if (ringp->s_ring_state & S_RING_DESTROY) { - ringp->s_ring_state |= S_RING_DEAD; - CALLB_CPR_EXIT(&cprinfo); - thread_exit(); - } - } - if (ringp->s_ring_state & S_RING_PROC) { - goto still_wait; - } - CALLB_CPR_SAFE_END(&cprinfo, lock); - } - soft_ring_drain(ringp, -1); - } -} - -void -dls_soft_ring_disable(dls_channel_t dc) -{ - dls_impl_t *dip = (dls_impl_t *)dc; - soft_ring_t **ringp_list = NULL; - int ring_size; - - rw_enter(&(dip->di_lock), RW_READER); - if (dip->di_soft_ring_list != NULL) { - ringp_list = dip->di_soft_ring_list; - ring_size = dip->di_soft_ring_size; - } - rw_exit(&(dip->di_lock)); - - if (ringp_list != NULL) - soft_ring_stop_workers(ringp_list, ring_size); -} - -static void 
-dls_taskq_stop_soft_ring(void *arg) -{ - soft_ring_taskq_t *ring_taskq; - soft_ring_t **ringp_list; - int ring_size; - - ring_taskq = (soft_ring_taskq_t *)arg; - ringp_list = ring_taskq->ringp_list; - ring_size = ring_taskq->ring_size; - kmem_free(ring_taskq, sizeof (soft_ring_taskq_t)); - - soft_ring_stop_workers(ringp_list, ring_size); - soft_ring_set_destroy(ringp_list, ring_size); -} - -boolean_t -dls_soft_ring_enable(dls_channel_t dc, dl_capab_dls_t *soft_ringp) -{ - dls_impl_t *dip; - int i; - soft_ring_t **softring_set; - soft_ring_t *softring; - mac_rx_fifo_t mrf; - soft_ring_taskq_t *ring_taskq; - char name[64]; - - dip = (dls_impl_t *)dc; - - rw_enter(&(dip->di_lock), RW_WRITER); - - if (dip->di_soft_ring_list != NULL) { - /* - * Both ds_lock and di_lock are held as writer. - * As soft_ring_stop_workers() blocks for the - * worker thread(s) to complete, there is a possibility - * that the worker thread(s) could be in the process - * of draining the queue and is blocked waiting for - * either ds_lock or di_lock. Moreover the NIC interrupt - * thread could be blocked in dls_accept(). - * To avoid deadlock condition, taskq thread would be - * created to handle soft_ring_stop_workers() and - * blocking if required which would avoid holding - * both ds_lock and di_lock. - * NOTE: we cannot drop either locks here, due to - * weird race conditions seen. - */ - ring_taskq = (soft_ring_taskq_t *) - kmem_zalloc(sizeof (soft_ring_taskq_t), KM_NOSLEEP); - if (ring_taskq == NULL) { - rw_exit(&(dip->di_lock)); - return (B_FALSE); - } - ring_taskq->ringp_list = dip->di_soft_ring_list; - ring_taskq->ring_size = dip->di_soft_ring_size; - if (taskq_dispatch(system_taskq, dls_taskq_stop_soft_ring, - ring_taskq, TQ_NOSLEEP) == NULL) { - rw_exit(&(dip->di_lock)); - kmem_free(ring_taskq, sizeof (soft_ring_taskq_t)); - return (B_FALSE); - } - dip->di_soft_ring_list = NULL; - } - dip->di_soft_ring_size = 0; - - bzero(name, sizeof (name)); - (void) snprintf(name, sizeof (name), "dls_soft_ring_%p", (void *)dip); - dip->di_soft_ring_list = soft_ring_set_create(name, S_RING_BIND_NONE, - 0, S_RING_WORKER_ONLY, minclsyspri, soft_ringp->dls_ring_cnt); - - if (dip->di_soft_ring_list == NULL) { - rw_exit(&(dip->di_lock)); - return (B_FALSE); - } - - dip->di_soft_ring_size = soft_ringp->dls_ring_cnt; - softring_set = dip->di_soft_ring_list; - - dip->di_ring_add = (mac_resource_add_t)soft_ringp->dls_ring_add; - dip->di_rx = (dls_rx_t)soft_ringp->dls_ring_assign; - dip->di_rx_arg = (void *)soft_ringp->dls_rx_handle; - - bzero(&mrf, sizeof (mac_rx_fifo_t)); - mrf.mrf_type = MAC_RX_FIFO; - for (i = 0; i < soft_ringp->dls_ring_cnt; i++) { - softring = softring_set[i]; - mrf.mrf_arg = softring; - softring->s_ring_upcall_arg1 = - (void *)soft_ringp->dls_rx_handle; - softring->s_ring_upcall_arg2 = - dip->di_ring_add((void *)soft_ringp->dls_rx_handle, - (mac_resource_t *)&mrf); - softring->s_ring_upcall = - (s_ring_proc_t)soft_ringp->dls_rx; - } - - /* - * Note that soft_ring is enabled. This prevents further DLIOCHDRINFO - * ioctls from overwriting the receive function pointer. 
- */ - rw_exit(&(dip->di_lock)); - return (B_TRUE); -} - -int dls_bad_ip_pkt = 0; - -static mblk_t * -dls_skip_mblk(mblk_t *bp, mblk_t *mp, int *skip_lenp) -{ - while (MBLKL(bp) <= *skip_lenp) { - *skip_lenp -= MBLKL(bp); - bp = bp->b_cont; - if (bp == NULL) { - dls_bad_ip_pkt++; - freemsg(mp); - return (NULL); - } - } - return (bp); -} - -#define HASH32(x) (((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x)) -#define COMPUTE_INDEX(key, sz) (key % sz) - -/* - * dls_soft_ring_fanout(): - */ -/* ARGSUSED */ -void -dls_soft_ring_fanout(void *rx_handle, void *rx_cookie, mblk_t *mp_chain, - mac_header_info_t *mhip) -{ - mblk_t *mp, *bp, *head, *tail; - ipha_t *ipha; - dls_impl_t *dip = (dls_impl_t *)rx_handle; - int indx, saved_indx; - int hash = 0; - int skip_len; - uint8_t protocol; - int count = 0; - - head = tail = NULL; - - while (mp_chain != NULL) { - bp = mp = mp_chain; - mp_chain = mp_chain->b_next; - mp->b_next = NULL; - if ((MBLKL(mp) < sizeof (ipha_t)) || !OK_32PTR(mp->b_rptr)) { - mp = msgpullup(bp, sizeof (ipha_t)); - freemsg(bp); - if (mp == NULL) { - dls_bad_ip_pkt++; - continue; - } - bp = mp; - } - - ipha = (ipha_t *)mp->b_rptr; - skip_len = IPH_HDR_LENGTH(ipha); - protocol = ipha->ipha_protocol; - again: - switch (protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - case IPPROTO_ESP: - /* - * Note that for ESP, we fanout on SPI and it is at the - * same offset as the 2x16-bit ports. So it is clumped - * along with TCP, UDP and SCTP. - */ - if (MBLKL(bp) <= skip_len) { - bp = dls_skip_mblk(bp, mp, &skip_len); - if (bp == NULL) - continue; - } - - hash = HASH32(*(uint32_t *)(bp->b_rptr + skip_len)); - break; - - case IPPROTO_AH: { - ah_t *ah; - uint_t ah_length; - - if (MBLKL(bp) <= skip_len) { - bp = dls_skip_mblk(bp, mp, &skip_len); - if (bp == NULL) - continue; - } - - ah = (ah_t *)(bp->b_rptr + skip_len); - protocol = ah->ah_nexthdr; - ah_length = AH_TOTAL_LEN(ah); - skip_len += ah_length; - goto again; - } - - default: - /* - * Send the packet to a ring based on src/dest addresses - */ - hash = - (HASH32(ipha->ipha_src) ^ HASH32(ipha->ipha_dst)); - break; - } - - indx = COMPUTE_INDEX(hash, dip->di_soft_ring_size); - if (head == NULL) { - saved_indx = indx; - head = tail = mp; - count++; - } else if (indx == saved_indx) { - tail->b_next = mp; - tail = mp; - count++; - } else { - soft_ring_process(dip->di_soft_ring_list[saved_indx], - head, tail, count); - head = tail = mp; - saved_indx = indx; - count = 1; - } - } - if (head != NULL) - soft_ring_process(dip->di_soft_ring_list[saved_indx], - head, tail, count); -} diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 99f41d0c7d..a6f89a8b49 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -23,22 +23,12 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/atomic.h> -#include <sys/kstat.h> -#include <sys/vlan.h> -#include <sys/mac.h> +#include <sys/dld_impl.h> #include <sys/mac_ether.h> -#include <sys/ctype.h> -#include <sys/dls.h> -#include <sys/dls_impl.h> static mac_stat_info_t i_dls_si[] = { { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, @@ -66,35 +56,18 @@ static mac_stat_info_t i_dls_si[] = { #define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) /* - * Private functions. 
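Stripped of the mblk walking, the fanout policy in dls_soft_ring_fanout() above reduces to one computation: fold 32 bits of flow identity with HASH32() and index modulo the ring count. For TCP, UDP, SCTP and ESP the identity word is the pair of 16-bit ports (for ESP the SPI, which sits at the same offset); AH headers are unwrapped first; everything else falls back to source XOR destination address. A self-contained sketch of just the arithmetic:

#include <sys/types.h>

#define	HASH32(x)	(((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x))

/*
 * have_ports is B_TRUE when the protocol carries a 32-bit port/SPI
 * word; ports holds that word, src/dst the IPv4 addresses.
 */
static int
fanout_index(uint32_t src, uint32_t dst, uint32_t ports,
    boolean_t have_ports, uint_t nrings)
{
	uint32_t hash;

	hash = have_ports ? HASH32(ports) : (HASH32(src) ^ HASH32(dst));
	return ((int)(hash % nrings));
}

Note how the caller above batches consecutive packets that hash to the same index into one chain before calling soft_ring_process(), so a burst belonging to a single flow costs one enqueue rather than one per packet.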
- */ - -static int -i_dls_mac_stat_update(kstat_t *ksp, int rw) -{ - dls_vlan_t *dvp = ksp->ks_private; - - return (dls_stat_update(ksp, dvp, rw)); -} - -/* * Exported functions. */ int -dls_stat_update(kstat_t *ksp, dls_vlan_t *dvp, int rw) +dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - dls_link_t *dlp = dvp->dv_dlp; kstat_named_t *knp; uint_t i; uint64_t val; - int err; if (rw != KSTAT_READ) return (EACCES); - if ((err = dls_mac_hold(dlp)) != 0) - return (err); - knp = (kstat_named_t *)ksp->ks_data; for (i = 0; i < STAT_INFO_COUNT; i++) { val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); @@ -124,7 +97,6 @@ dls_stat_update(kstat_t *ksp, dls_vlan_t *dvp, int rw) } knp++; knp->value.ui32 = dlp->dl_unknowns; - dls_mac_rele(dlp); return (0); } @@ -158,45 +130,3 @@ dls_stat_create(const char *module, int instance, const char *name, *kspp = ksp; return (0); } - -void -dls_mac_stat_create(dls_vlan_t *dvp) -{ - kstat_t *ksp = NULL; - major_t major; - - /* - * Create the legacy kstats to provide backward compatibility. - * These kstats need to be created even when this link does not - * have a link name, i.e., when the VLAN is accessed using its - * /dev node. - * - * Note that we only need to create the legacy kstats for GLDv3 - * physical links, aggregation links which are created using - * the 'key' option, and any VLAN links created over them. - * This can be determined by checking its dv_ppa. - */ - ASSERT(dvp->dv_ksp == NULL); - if (dvp->dv_ppa >= MAC_MAX_MINOR) - return; - - major = getmajor(dvp->dv_dev); - ASSERT(GLDV3_DRV(major) && (dvp->dv_ksp == NULL)); - - if (dls_stat_create(ddi_major_to_name(major), - dvp->dv_id * 1000 + dvp->dv_ppa, NULL, - i_dls_mac_stat_update, dvp, &ksp) != 0) { - return; - } - ASSERT(ksp != NULL); - dvp->dv_ksp = ksp; -} - -void -dls_mac_stat_destroy(dls_vlan_t *dvp) -{ - if (dvp->dv_ksp != NULL) { - kstat_delete(dvp->dv_ksp); - dvp->dv_ksp = NULL; - } -} diff --git a/usr/src/uts/common/io/dls/dls_vlan.c b/usr/src/uts/common/io/dls/dls_vlan.c deleted file mode 100644 index 9df000e86a..0000000000 --- a/usr/src/uts/common/io/dls/dls_vlan.c +++ /dev/null @@ -1,561 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Data-Link Services Module - */ - -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/modhash.h> -#include <sys/stat.h> -#include <sys/kstat.h> -#include <sys/vlan.h> -#include <sys/mac.h> -#include <sys/ctype.h> -#include <sys/dls.h> -#include <sys/dls_impl.h> - -static kmem_cache_t *i_dls_vlan_cachep; -static mod_hash_t *i_dls_vlan_hash; -static mod_hash_t *i_dls_vlan_dev_hash; -static krwlock_t i_dls_vlan_lock; -static uint_t i_dls_vlan_count; - -#define VLAN_HASHSZ 67 /* prime */ - -/* - * Private functions. - */ - -/*ARGSUSED*/ -static int -i_dls_vlan_constructor(void *buf, void *arg, int kmflag) -{ - dls_vlan_t *dvp = buf; - - bzero(buf, sizeof (dls_vlan_t)); - mutex_init(&dvp->dv_lock, NULL, MUTEX_DEFAULT, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -i_dls_vlan_destructor(void *buf, void *arg) -{ - dls_vlan_t *dvp = buf; - - ASSERT(dvp->dv_ref == 0); - ASSERT(dvp->dv_zone_ref == 0); - mutex_destroy(&dvp->dv_lock); -} - -/* - * Module initialization functions. - */ -void -dls_vlan_init(void) -{ - /* - * Create a kmem_cache of dls_vlan_t structures. - */ - i_dls_vlan_cachep = kmem_cache_create("dls_vlan_cache", - sizeof (dls_vlan_t), 0, i_dls_vlan_constructor, - i_dls_vlan_destructor, NULL, NULL, NULL, 0); - ASSERT(i_dls_vlan_cachep != NULL); - - /* - * Create a hash table, keyed by dv_spa, of dls_vlan_t. - */ - i_dls_vlan_hash = mod_hash_create_extended("dls_vlan_hash", - VLAN_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, - mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); - - /* - * Create a hash table, keyed by dv_dev, of dls_vlan_t. - */ - i_dls_vlan_dev_hash = mod_hash_create_ptrhash("dls_vlan_dev_hash", - VLAN_HASHSZ, mod_hash_null_valdtor, sizeof (dev_t)); - - rw_init(&i_dls_vlan_lock, NULL, RW_DEFAULT, NULL); - i_dls_vlan_count = 0; -} - -int -dls_vlan_fini(void) -{ - if (i_dls_vlan_count > 0) - return (EBUSY); - - /* - * Destroy the hash table - */ - mod_hash_destroy_hash(i_dls_vlan_hash); - mod_hash_destroy_hash(i_dls_vlan_dev_hash); - rw_destroy(&i_dls_vlan_lock); - - /* - * Destroy the kmem_cache. - */ - kmem_cache_destroy(i_dls_vlan_cachep); - return (0); -} - -/* - * Exported functions. - */ - -/* - * If vid is VLAN_ID_NONE, then the minor_t to access this dls_vlan_t is - * ppa + 1, otherwise, we need to allocate the minor_t in this function. - * - * If ppa is greater than DLS_MAX_PPA, it means that we do not need to create - * the VLAN minor node for this MAC, as this MAC is (a) a legacy device, (b) - * an aggr created without the "key" argument, or (c) a new type of link - * whose ppa is allocated by mac_minor_hold() in mac_register(). - */ -int -dls_vlan_create(const char *macname, uint16_t vid, boolean_t force) -{ - char node[MAXPATHLEN]; - char spa[MAXSPALEN]; - char *driver; - dls_link_t *dlp; - dls_vlan_t *dvp; - minor_t minor = 0; - mac_handle_t mh; - int ppa; - dev_info_t *dip; - uint32_t margin = VLAN_TAGSZ; - int err = 0; - - if ((err = mac_open(macname, &mh)) != 0) - return (err); - - /* - * First check whether VLANs are able to be created on this MAC. - */ - if (vid != VLAN_ID_NONE) { - if ((mac_info(mh)->mi_media != DL_ETHER) || - (mac_info(mh)->mi_nativemedia != DL_ETHER)) { - mac_close(mh); - return (EINVAL); - } - if (!force && - ((err = mac_margin_add(mh, &margin, B_FALSE)) != 0)) { - mac_close(mh); - return (err); - } - } - - /* - * Get a reference to a dls_link_t representing the MAC. This call - * will create one if necessary. 
- */ - if ((err = dls_link_hold(macname, &dlp)) != 0) { - if (vid != VLAN_ID_NONE && !force) - VERIFY(mac_margin_remove(mh, margin) == 0); - mac_close(mh); - return (err); - } - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - - /* - * Try to find this VLAN in i_dls_vlan_hash first. The spa - * is in the <macname/vid> form. - */ - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); - if ((err = mod_hash_find(i_dls_vlan_hash, - (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) == 0) { - err = EEXIST; - goto fail; - } - - ppa = mac_minor(mh) - 1; - dip = mac_devinfo_get(mh); - - if (vid == VLAN_ID_NONE) { - /* - * Derives minor number directly from non-VLAN link's PPA. - */ - minor = ppa + 1; - } else if ((minor = mac_minor_hold(B_TRUE)) == 0) { - /* - * Allocate minor number from minor_arenap for VLANs. - */ - err = ENOMEM; - goto fail; - } - - /* - * First create its minor node for non-legacy links, including VLANs - * and non-VLANs. This is for /dev nodes backward compatibility. - */ - if (vid != VLAN_ID_NONE && ppa < MAC_MAX_MINOR) { - - driver = (char *)ddi_driver_name(dip); - - /* Create a style-1 DLPI device */ - (void) snprintf(node, MAXPATHLEN, "%s%d", driver, - vid * 1000 + ppa); - if (ddi_create_minor_node(dip, node, S_IFCHR, minor, - DDI_NT_NET, 0) != DDI_SUCCESS) { - err = EINVAL; - goto fail; - } - } - - dvp = kmem_cache_alloc(i_dls_vlan_cachep, KM_SLEEP); - dvp->dv_id = vid; - dvp->dv_dlp = dlp; - dvp->dv_dev = makedevice(ddi_driver_major(dip), minor); - dvp->dv_dip = dip; - dvp->dv_ppa = ppa; - dvp->dv_force = force; - dvp->dv_ref = 0; - dvp->dv_zone_ref = 0; - dvp->dv_zid = GLOBAL_ZONEID; - (void) strlcpy(dvp->dv_spa, spa, MAXSPALEN); - dls_mac_stat_create(dvp); - - err = mod_hash_insert(i_dls_vlan_hash, - (mod_hash_key_t)dvp->dv_spa, (mod_hash_val_t)dvp); - ASSERT(err == 0); - - err = mod_hash_insert(i_dls_vlan_dev_hash, - (mod_hash_key_t)dvp->dv_dev, (mod_hash_val_t)dvp); - ASSERT(err == 0); - - i_dls_vlan_count++; - rw_exit(&i_dls_vlan_lock); - - /* - * Hold the underlying MAC for VLANs to keep the margin request. - * We cannot hold the mac for non-VLANs, because a reference would - * prevent the device from detaching. - */ - if (vid != VLAN_ID_NONE) - VERIFY(dls_mac_hold(dvp->dv_dlp) == 0); - - mac_close(mh); - return (0); - -fail: - rw_exit(&i_dls_vlan_lock); - if (vid != VLAN_ID_NONE && minor != 0) - mac_minor_rele(minor); - dls_link_rele(dlp); - if (vid != VLAN_ID_NONE && !force) - VERIFY(mac_margin_remove(mh, margin) == 0); - mac_close(mh); - return (err); -} - -int -dls_vlan_destroy(const char *macname, uint16_t vid) -{ - char spa[MAXSPALEN]; - dls_vlan_t *dvp; - mod_hash_val_t val; - int err; - - /* - * Try to find this VLAN in i_dls_vlan_hash first. The spa - * is in the <macname/vid> form. - */ - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - - if ((err = mod_hash_find(i_dls_vlan_hash, - (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) != 0) { - rw_exit(&i_dls_vlan_lock); - return (ENOENT); - } - - /* - * Check to see if it is referenced by any dls_impl_t. - */ - if (dvp->dv_ref != 0) { - rw_exit(&i_dls_vlan_lock); - return (EBUSY); - } - - ASSERT(dvp->dv_zone_ref == 0); - - /* - * Remove and destroy the hash table entry. 
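Two naming conventions from dls_vlan_create() recur throughout this file and are easy to lose in the diff: the hash key ("spa") is the string <macname>/<vid>, and the legacy /dev minor node for a VLAN encodes the VLAN id and PPA in a single decimal, <driver><vid * 1000 + ppa>. A sketch only, with buffer sizing left to the caller:

#include <sys/systm.h>	/* snprintf */

static void
vlan_names(const char *macname, const char *driver, uint16_t vid, int ppa,
    char *spa, size_t spalen, char *node, size_t nodelen)
{
	(void) snprintf(spa, spalen, "%s/%d", macname, vid);
	(void) snprintf(node, nodelen, "%s%d", driver, vid * 1000 + ppa);
}

For macname "bge0" (driver "bge", ppa 0) and vid 2 this yields the key "bge0/2" and the style-1 node "bge2000", the same dv_id * 1000 + dv_ppa arithmetic that dls_vlan_destroy() repeats below when removing the node.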
- */ - err = mod_hash_remove(i_dls_vlan_hash, - (mod_hash_key_t)dvp->dv_spa, (mod_hash_val_t *)&val); - ASSERT(err == 0); - ASSERT(dvp == (dls_vlan_t *)val); - - err = mod_hash_remove(i_dls_vlan_dev_hash, - (mod_hash_key_t)dvp->dv_dev, (mod_hash_val_t *)&val); - ASSERT(err == 0); - ASSERT(dvp == (dls_vlan_t *)val); - - if (vid != VLAN_ID_NONE && dvp->dv_ppa < MAC_MAX_MINOR) { - char node[MAXPATHLEN]; - char *driver; - - /* - * Remove the minor nodes for this link. - */ - driver = (char *)ddi_driver_name(dvp->dv_dip); - (void) snprintf(node, MAXPATHLEN, "%s%d", driver, - vid * 1000 + dvp->dv_ppa); - ddi_remove_minor_node(dvp->dv_dip, node); - } - - dls_mac_stat_destroy(dvp); - - ASSERT(i_dls_vlan_count > 0); - i_dls_vlan_count--; - rw_exit(&i_dls_vlan_lock); - - if (vid != VLAN_ID_NONE) { - if (!dvp->dv_force) { - (void) mac_margin_remove(dvp->dv_dlp->dl_mh, - VLAN_TAGSZ); - } - dls_mac_rele(dvp->dv_dlp); - } - - /* - * Release minor to dls_minor_arenap for VLANs - */ - if (vid != VLAN_ID_NONE) - mac_minor_rele(getminor(dvp->dv_dev)); - - /* - * Release the dls_link_t. This will destroy the dls_link_t and - * release the MAC if there are no more dls_vlan_t. - */ - dls_link_rele(dvp->dv_dlp); - kmem_cache_free(i_dls_vlan_cachep, dvp); - return (0); -} - -int -dls_vlan_hold(const char *macname, uint16_t vid, dls_vlan_t **dvpp, - boolean_t force, boolean_t create_vlan) -{ - char spa[MAXSPALEN]; - dls_vlan_t *dvp; - boolean_t vlan_created; - int err = 0; - - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); - -again: - rw_enter(&i_dls_vlan_lock, RW_WRITER); - if ((err = mod_hash_find(i_dls_vlan_hash, - (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) != 0) { - - ASSERT(err == MH_ERR_NOTFOUND); - - vlan_created = B_FALSE; - if (!create_vlan || vid == VLAN_ID_NONE) { - rw_exit(&i_dls_vlan_lock); - return (ENOENT); - } - rw_exit(&i_dls_vlan_lock); - - err = dls_vlan_create(macname, vid, force); - if ((err != 0) && (err != EEXIST)) - return (err); - - /* - * At this point someone else could do a dls_vlan_hold and - * dls_vlan_rele on this new vlan and causes it to be - * destroyed. This will at worst cause us to spin a few - * times. - */ - vlan_created = (err != EEXIST); - goto again; - } - - dvp->dv_ref++; - rw_exit(&i_dls_vlan_lock); - - if ((err = dls_mac_hold(dvp->dv_dlp)) != 0) { - rw_enter(&i_dls_vlan_lock, RW_WRITER); - dvp->dv_ref--; - rw_exit(&i_dls_vlan_lock); - if (vlan_created) - (void) dls_vlan_destroy(macname, vid); - return (err); - } - - *dvpp = dvp; - return (0); -} - -int -dls_vlan_hold_by_dev(dev_t dev, dls_vlan_t **dvpp) -{ - dls_vlan_t *dvp; - int err; - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - if ((err = mod_hash_find(i_dls_vlan_dev_hash, (mod_hash_key_t)dev, - (mod_hash_val_t *)&dvp)) != 0) { - ASSERT(err == MH_ERR_NOTFOUND); - rw_exit(&i_dls_vlan_lock); - return (ENOENT); - } - - dvp->dv_ref++; - rw_exit(&i_dls_vlan_lock); - - if ((err = dls_mac_hold(dvp->dv_dlp)) != 0) { - rw_enter(&i_dls_vlan_lock, RW_WRITER); - dvp->dv_ref--; - rw_exit(&i_dls_vlan_lock); - return (err); - } - - *dvpp = dvp; - return (0); -} - -/* - * Free the dvp if this is a VLAN and this is the last reference. 
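dls_vlan_hold() above resolves the lookup-or-create race with a retry loop rather than by holding the table lock across creation: on a miss it drops the lock, calls dls_vlan_create(), treats EEXIST as having lost the race, and loops back to look up again. The shape of that pattern, condensed with hypothetical obj_t, lookup() and create() stand-ins and a plain mutex in place of the rwlock:

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/mutex.h>

typedef struct obj { uint_t ref; } obj_t;

extern kmutex_t table_lock;
extern obj_t *lookup(const char *);	/* called with table_lock held */
extern int create(const char *);	/* may return EEXIST */

static int
hold_or_create(const char *key, obj_t **objp)
{
	obj_t *obj;
	int err;

again:
	mutex_enter(&table_lock);
	if ((obj = lookup(key)) != NULL) {
		obj->ref++;
		mutex_exit(&table_lock);
		*objp = obj;
		return (0);
	}
	mutex_exit(&table_lock);

	if ((err = create(key)) != 0 && err != EEXIST)
		return (err);
	goto again;	/* at worst we spin a few times, as noted above */
}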
- */ -void -dls_vlan_rele(dls_vlan_t *dvp) -{ - char macname[MAXNAMELEN]; - uint16_t vid; - boolean_t destroy_vlan = B_FALSE; - - dls_mac_rele(dvp->dv_dlp); - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - if (--dvp->dv_ref != 0) { - rw_exit(&i_dls_vlan_lock); - return; - } - - if (dvp->dv_id != VLAN_ID_NONE) { - destroy_vlan = B_TRUE; - (void) strncpy(macname, dvp->dv_dlp->dl_name, MAXNAMELEN); - vid = dvp->dv_id; - } - rw_exit(&i_dls_vlan_lock); - - if (destroy_vlan) - (void) dls_vlan_destroy(macname, vid); -} - -int -dls_vlan_setzid(const char *mac, uint16_t vid, zoneid_t zid) -{ - dls_vlan_t *dvp; - int err; - zoneid_t old_zid; - - if ((err = dls_vlan_hold(mac, vid, &dvp, B_FALSE, B_TRUE)) != 0) - return (err); - - mutex_enter(&dvp->dv_lock); - if ((old_zid = dvp->dv_zid) == zid) { - mutex_exit(&dvp->dv_lock); - goto done; - } - - /* - * Check whether this dvp is used by its own zones, if yes, - * we cannot change its zoneid. - */ - if (dvp->dv_zone_ref != 0) { - mutex_exit(&dvp->dv_lock); - err = EBUSY; - goto done; - } - - if (zid == GLOBAL_ZONEID) { - /* - * Move the link from the local zone to the global zone, - * and release the reference to this link. At the same time - * reset the link's active state so that an aggregation is - * allowed to be created over it. - */ - dvp->dv_zid = zid; - mutex_exit(&dvp->dv_lock); - dls_mac_active_clear(dvp->dv_dlp); - dls_vlan_rele(dvp); - goto done; - } else if (old_zid == GLOBAL_ZONEID) { - /* - * Move the link from the global zone to the local zone, - * and hold a reference to this link. Also, set the link - * to the "active" state so that the global zone is - * not able to create an aggregation over this link. - * TODO: revisit once we allow creating aggregations - * within a local zone. - */ - if (!dls_mac_active_set(dvp->dv_dlp)) { - mutex_exit(&dvp->dv_lock); - err = EBUSY; - goto done; - } - dvp->dv_zid = zid; - mutex_exit(&dvp->dv_lock); - return (0); - } else { - /* - * Move the link from a local zone to another local zone. - */ - dvp->dv_zid = zid; - mutex_exit(&dvp->dv_lock); - } - -done: - dls_vlan_rele(dvp); - return (err); -} - -/* - * Find dev_info_t based on the minor node of the link. - */ -dev_info_t * -dls_finddevinfo(dev_t dev) -{ - dls_vlan_t *dvp; - dev_info_t *dip; - - if (dls_vlan_hold_by_dev(dev, &dvp) != 0) - return (NULL); - - dip = dvp->dv_dip; - dls_vlan_rele(dvp); - return (dip); -} diff --git a/usr/src/uts/common/io/dmfe/dmfe_impl.h b/usr/src/uts/common/io/dmfe/dmfe_impl.h index 6792f540bd..978229574d 100644 --- a/usr/src/uts/common/io/dmfe/dmfe_impl.h +++ b/usr/src/uts/common/io/dmfe/dmfe_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_DMFE_IMPL_H #define _SYS_DMFE_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -56,7 +54,7 @@ extern "C" { #include <sys/sunddi.h> #include <sys/miiregs.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include "dmfe.h" diff --git a/usr/src/uts/common/io/dmfe/dmfe_main.c b/usr/src/uts/common/io/dmfe/dmfe_main.c index 152c14f1e8..c231f61ec4 100644 --- a/usr/src/uts/common/io/dmfe/dmfe_main.c +++ b/usr/src/uts/common/io/dmfe/dmfe_main.c @@ -207,12 +207,11 @@ static int dmfe_m_promisc(void *, boolean_t); static int dmfe_m_multicst(void *, boolean_t, const uint8_t *); static int dmfe_m_unicst(void *, const uint8_t *); static void dmfe_m_ioctl(void *, queue_t *, mblk_t *); -static boolean_t dmfe_m_getcapab(void *, mac_capab_t, void *); static mblk_t *dmfe_m_tx(void *, mblk_t *); static int dmfe_m_stat(void *, uint_t, uint64_t *); static mac_callbacks_t dmfe_m_callbacks = { - (MC_IOCTL | MC_GETCAPAB), + (MC_IOCTL), dmfe_m_stat, dmfe_m_start, dmfe_m_stop, @@ -220,9 +219,8 @@ static mac_callbacks_t dmfe_m_callbacks = { dmfe_m_multicst, dmfe_m_unicst, dmfe_m_tx, - NULL, dmfe_m_ioctl, - dmfe_m_getcapab, + NULL, }; @@ -1621,46 +1619,6 @@ dmfe_m_promisc(void *arg, boolean_t on) return (0); } -/*ARGSUSED*/ -static boolean_t -dmfe_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) -{ - /* - * Note that the chip could support some form of polling and - * multiaddress support. We should look into adding polling - * support later, once Solaris is better positioned to take - * advantage of it, although it may be of little use since - * even a lowly 500MHz US-IIe should be able to keep up with - * 100Mbps. (Esp. if the packets are not unreasonably sized.) - * - * Multiaddress support, however, is likely to be of more - * utility with crossbow and virtualized NICs. Although, the - * fact that dmfe is only supported on low-end US-IIe hardware - * makes one wonder whether VNICs are likely to be used on - * such platforms. The chip certainly supports the notion, - * since it can be run in HASH-ONLY mode. (Though this would - * require software to drop unicast packets that are - * incorrectly received due to hash collision of the - * destination mac address.) - * - * Interestingly enough, modern Davicom chips (the 9102D) - * support full IP checksum offload, though its unclear - * whether any of these chips are used on any systems that can - * run Solaris. - * - * If this driver is ever supported on x86 hardware, then - * these assumptions should be revisited. 
- */ - switch (cap) { - case MAC_CAPAB_POLL: - case MAC_CAPAB_MULTIADDRESS: - case MAC_CAPAB_HCKSUM: - default: - return (B_FALSE); - } -} - - #undef DMFE_DBG diff --git a/usr/src/uts/common/io/e1000g/e1000g_main.c b/usr/src/uts/common/io/e1000g/e1000g_main.c index 8bde171cbb..e7fe619c3e 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_main.c +++ b/usr/src/uts/common/io/e1000g/e1000g_main.c @@ -64,8 +64,6 @@ static uint_t e1000g_intr_pciexpress(caddr_t); static uint_t e1000g_intr(caddr_t); static void e1000g_intr_work(struct e1000g *, uint32_t); #pragma inline(e1000g_intr_work) -static uint32_t e1000g_get_itr(uint32_t, uint32_t, uint32_t); -#pragma inline(e1000g_get_itr) static int e1000g_init(struct e1000g *); static int e1000g_start(struct e1000g *, boolean_t); static void e1000g_stop(struct e1000g *, boolean_t); @@ -73,11 +71,6 @@ static int e1000g_m_start(void *); static void e1000g_m_stop(void *); static int e1000g_m_promisc(void *, boolean_t); static boolean_t e1000g_m_getcapab(void *, mac_capab_t, void *); -static int e1000g_m_unicst(void *, const uint8_t *); -static int e1000g_m_unicst_add(void *, mac_multi_addr_t *); -static int e1000g_m_unicst_remove(void *, mac_addr_slot_t); -static int e1000g_m_unicst_modify(void *, mac_multi_addr_t *); -static int e1000g_m_unicst_get(void *, mac_multi_addr_t *); static int e1000g_m_multicst(void *, boolean_t, const uint8_t *); static void e1000g_m_ioctl(void *, queue_t *, mblk_t *); static int e1000g_m_setprop(void *, const char *, mac_prop_id_t, @@ -98,7 +91,7 @@ static int e1000g_register_mac(struct e1000g *); static boolean_t e1000g_rx_drain(struct e1000g *); static boolean_t e1000g_tx_drain(struct e1000g *); static void e1000g_init_unicst(struct e1000g *); -static int e1000g_unicst_set(struct e1000g *, const uint8_t *, mac_addr_slot_t); +static int e1000g_unicst_set(struct e1000g *, const uint8_t *, int); /* * Local routines @@ -172,10 +165,8 @@ mac_priv_prop_t e1000g_priv_props[] = { {"_rx_intr_abs_delay", MAC_PROP_PERM_RW}, {"_intr_throttling_rate", MAC_PROP_PERM_RW}, {"_intr_adaptive", MAC_PROP_PERM_RW}, - {"_tx_recycle_thresh", MAC_PROP_PERM_RW}, {"_adv_pause_cap", MAC_PROP_PERM_READ}, {"_adv_asym_pause_cap", MAC_PROP_PERM_READ}, - {"_tx_recycle_num", MAC_PROP_PERM_RW} }; #define E1000G_MAX_PRIV_PROPS \ (sizeof (e1000g_priv_props)/sizeof (mac_priv_prop_t)) @@ -245,9 +236,8 @@ static mac_callbacks_t e1000g_m_callbacks = { e1000g_m_stop, e1000g_m_promisc, e1000g_m_multicst, - e1000g_m_unicst, - e1000g_m_tx, NULL, + e1000g_m_tx, e1000g_m_ioctl, e1000g_m_getcapab, NULL, @@ -607,6 +597,7 @@ e1000g_register_mac(struct e1000g *Adapter) mac->m_margin = VLAN_TAGSZ; mac->m_priv_props = e1000g_priv_props; mac->m_priv_prop_count = E1000G_MAX_PRIV_PROPS; + mac->m_v12n = MAC_VIRT_LEVEL1; err = mac_register(mac, &Adapter->mh); mac_free(mac); @@ -935,17 +926,17 @@ e1000g_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) if (Adapter == NULL) return (DDI_FAILURE); + rx_drain = e1000g_rx_drain(Adapter); + if (!rx_drain && !e1000g_force_detach) + return (DDI_FAILURE); + if (mac_unregister(Adapter->mh) != 0) { e1000g_log(Adapter, CE_WARN, "Unregister MAC failed"); return (DDI_FAILURE); } Adapter->attach_progress &= ~ATTACH_PROGRESS_MAC; - - if (Adapter->chip_state != E1000G_STOP) - e1000g_stop(Adapter, B_TRUE); - - rx_drain = e1000g_rx_drain(Adapter); + ASSERT(Adapter->chip_state == E1000G_STOP); /* * If e1000g_force_detach is enabled, driver detach is safe. 
@@ -955,9 +946,6 @@ e1000g_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
 	 */
 	if (e1000g_force_detach) {
 		e1000g_free_priv_devi_node(Adapter, rx_drain);
-	} else {
-		if (!rx_drain)
-			return (DDI_FAILURE);
 	}
 
 	e1000g_unattach(devinfo, Adapter);
@@ -1122,6 +1110,8 @@ e1000g_init_locks(struct e1000g *Adapter)
 	    MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
 	mutex_init(&rx_ring->freelist_lock, NULL,
 	    MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
+	mutex_init(&rx_ring->recycle_lock, NULL,
+	    MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
 }
 
 static void
@@ -1138,6 +1128,7 @@ e1000g_destroy_locks(struct e1000g *Adapter)
 	rx_ring = Adapter->rx_ring;
 	mutex_destroy(&rx_ring->rx_lock);
 	mutex_destroy(&rx_ring->freelist_lock);
+	mutex_destroy(&rx_ring->recycle_lock);
 
 	mutex_destroy(&Adapter->link_lock);
 	mutex_destroy(&Adapter->watchdog_lock);
@@ -1432,6 +1423,8 @@ e1000g_init(struct e1000g *Adapter)
 		goto init_fail;
 	}
 
+	Adapter->poll_mode = e1000g_poll_mode;
+
 	rw_exit(&Adapter->chip_lock);
 	return (DDI_SUCCESS);
 
@@ -1549,6 +1542,106 @@ e1000g_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
 	}
 }
 
+/*
+ * The default value of e1000g_poll_mode == 0 assumes that the NIC is
+ * capable of supporting only one interrupt and we shouldn't disable
+ * the physical interrupt. In this case we let the interrupt come and
+ * we queue the packets in the rx ring itself if we are in polling
+ * mode (better latency but slightly lower performance and a very
+ * high interrupt count in mpstat, which is harmless).
+ *
+ * e1000g_poll_mode == 1 assumes that we have per Rx ring interrupt
+ * which can be disabled in poll mode. This gives better overall
+ * throughput (compared to the mode above), shows very low interrupt
+ * count but has slightly higher latency since we pick the packets when
+ * the poll thread does polling.
+ *
+ * Currently, this flag should be enabled only while doing performance
+ * measurement or when it can be guaranteed that the entire NIC going
+ * into poll mode will not harm any traffic, such as cluster heartbeats.
+ */
+int e1000g_poll_mode = 0;
+
+/*
+ * Called from the upper layers when the driver is in polling mode to
+ * pick up any queued packets. Care should be taken not to block
+ * this thread.
+ */
+static mblk_t *e1000g_poll_ring(void *arg, int bytes_to_pickup)
+{
+	e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)arg;
+	mblk_t *mp = NULL;
+	mblk_t *tail;
+	uint_t sz = 0;
+	struct e1000g *adapter;
+
+	adapter = rx_ring->adapter;
+
+	mutex_enter(&rx_ring->rx_lock);
+	ASSERT(rx_ring->poll_flag);
+
+	/*
+	 * Get any packets that have arrived. Works only if we
+	 * actually disable the physical adapter/rx_ring interrupt
+	 * (e1000g_poll_mode == 1). In case e1000g_poll_mode == 0,
+	 * packets will have already been added to the poll list
+	 * by the interrupt (see e1000g_intr_work()).
+	 */
+	if (adapter->poll_mode) {
+		mp = e1000g_receive(rx_ring, &tail, &sz);
+		if (mp != NULL) {
+			if (rx_ring->poll_list_head == NULL)
+				rx_ring->poll_list_head = mp;
+			else
+				rx_ring->poll_list_tail->b_next = mp;
+			rx_ring->poll_list_tail = tail;
+			rx_ring->poll_list_sz += sz;
+		}
+	}
+
+	mp = rx_ring->poll_list_head;
+	if (mp == NULL) {
+		mutex_exit(&rx_ring->rx_lock);
+		return (NULL);
+	}
+
+	/* Check if we can send up the entire chain */
+	if (bytes_to_pickup >= rx_ring->poll_list_sz) {
+		mp = rx_ring->poll_list_head;
+		rx_ring->poll_list_head = NULL;
+		rx_ring->poll_list_tail = NULL;
+		rx_ring->poll_list_sz = 0;
+		mutex_exit(&rx_ring->rx_lock);
+		return (mp);
+	}
+
+	/*
+	 * We need to find out how much of the chain we can send up. We
+	 * are guaranteed that at least one packet will go up since
+	 * we already checked that.
+	 */
+	tail = mp;
+	sz = 0;
+	while (mp != NULL) {
+		sz += MBLKL(mp);
+		if (sz > bytes_to_pickup) {
+			sz -= MBLKL(mp);
+			break;
+		}
+		tail = mp;
+		mp = mp->b_next;
+	}
+
+	mp = rx_ring->poll_list_head;
+	rx_ring->poll_list_head = tail->b_next;
+	if (rx_ring->poll_list_head == NULL)
+		rx_ring->poll_list_tail = NULL;
+	rx_ring->poll_list_sz -= sz;
+	tail->b_next = NULL;
+	mutex_exit(&rx_ring->rx_lock);
+	return (mp);
+}
+
 static int
 e1000g_m_start(void *arg)
 {
@@ -1912,7 +2005,6 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
 	struct e1000_hw *hw;
 	hw = &Adapter->shared;
 	e1000g_tx_ring_t *tx_ring = Adapter->tx_ring;
-	uint32_t itr;
 
 	Adapter->rx_pkt_cnt = 0;
 	Adapter->tx_pkt_cnt = 0;
@@ -1929,16 +2021,79 @@
 	}
 
 	if (icr & E1000_ICR_RXT0) {
-		mblk_t *mp;
+		mblk_t *mp;
+		uint_t sz = 0;
+		mblk_t *tmp, *tail = NULL;
+		e1000g_rx_ring_t *rx_ring;
 
-		mutex_enter(&Adapter->rx_ring->rx_lock);
-		mp = e1000g_receive(Adapter);
-		mutex_exit(&Adapter->rx_ring->rx_lock);
+		rx_ring = Adapter->rx_ring;
+		mutex_enter(&rx_ring->rx_lock);
+		/*
+		 * If the real interrupt for the Rx ring was
+		 * not disabled (e1000g_poll_mode == 0), then
+		 * we still pick up the packets and queue them
+		 * on the Rx ring if we were in polling mode. This
+		 * enables the polling thread to pick up packets
+		 * really fast in polling mode and helps improve
+		 * latency.
+		 */
+		mp = e1000g_receive(rx_ring, &tail, &sz);
 		rw_exit(&Adapter->chip_lock);
 
-		if (mp != NULL)
-			mac_rx(Adapter->mh, Adapter->mrh, mp);
+		if (mp != NULL) {
+			ASSERT(tail != NULL);
+			if (!rx_ring->poll_flag) {
+				/*
+				 * If not polling, see if something was
+				 * already queued. Take care not to
+				 * reorder packets.
+				 */
+				if (rx_ring->poll_list_head == NULL) {
+					mutex_exit(&rx_ring->rx_lock);
+					mac_rx_ring(Adapter->mh, rx_ring->mrh,
+					    mp, rx_ring->ring_gen_num);
+				} else {
+					tmp = rx_ring->poll_list_head;
+					rx_ring->poll_list_head = NULL;
+					rx_ring->poll_list_tail->b_next = mp;
+					rx_ring->poll_list_tail = NULL;
+					rx_ring->poll_list_sz = 0;
+					mutex_exit(&rx_ring->rx_lock);
+					mac_rx_ring(Adapter->mh, rx_ring->mrh,
+					    tmp, rx_ring->ring_gen_num);
+				}
+			} else {
+				/*
+				 * We are in polling mode. Put the
+				 * processed packets on the poll list.
+				 */
+				if (rx_ring->poll_list_head == NULL)
+					rx_ring->poll_list_head = mp;
+				else
+					rx_ring->poll_list_tail->b_next = mp;
+				rx_ring->poll_list_tail = tail;
+				rx_ring->poll_list_sz += sz;
+				mutex_exit(&rx_ring->rx_lock);
+			}
+		} else if (!rx_ring->poll_flag &&
+		    rx_ring->poll_list_head != NULL) {
+			/*
+			 * Nothing new has arrived (then why
+			 * was the interrupt raised??). Check
+			 * if something was queued from the
+			 * last time.
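The loop that just ended is a byte-budget splitter: at least one packet always goes up, further packets are appended while the running MBLKL() total stays within bytes_to_pickup, and the chain is cut after the last packet that fits, with the remainder left queued. Pulled out of the ring code (and, like the original, counting only the leading fragment of each packet), the split looks like this:

#include <sys/stream.h>
#include <sys/strsun.h>		/* MBLKL */

static mblk_t *
chain_split(mblk_t **headp, size_t *list_szp, size_t budget)
{
	mblk_t *head = *headp;
	mblk_t *tail = head;
	size_t sz = MBLKL(head);	/* first packet always goes up */
	mblk_t *next;

	while ((next = tail->b_next) != NULL &&
	    sz + MBLKL(next) <= budget) {
		sz += MBLKL(next);
		tail = next;
	}

	*headp = tail->b_next;		/* remainder stays queued */
	tail->b_next = NULL;
	*list_szp -= sz;		/* bytes handed up */
	return (head);
}

One deliberate difference in this sketch: an oversized first packet is still debited from the queued-byte count, whereas the original's accounting skips it in that corner case.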
+			 */
+			tmp = rx_ring->poll_list_head;
+			rx_ring->poll_list_head = NULL;
+			rx_ring->poll_list_tail = NULL;
+			rx_ring->poll_list_sz = 0;
+			mutex_exit(&rx_ring->rx_lock);
+			mac_rx_ring(Adapter->mh, rx_ring->mrh,
+			    tmp, rx_ring->ring_gen_num);
+		} else {
+			mutex_exit(&rx_ring->rx_lock);
+		}
 	} else
 		rw_exit(&Adapter->chip_lock);
 
@@ -1952,7 +2107,6 @@
 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_intr);
 		rw_exit(&Adapter->chip_lock);
 
-		/* Schedule the re-transmit */
 		if (tx_ring->resched_needed &&
 		    (tx_ring->tbd_avail > DEFAULT_TX_UPDATE_THRESHOLD)) {
 			tx_ring->resched_needed = B_FALSE;
@@ -1961,15 +2115,6 @@
 		}
 	}
 
-	if (Adapter->intr_adaptive) {
-		itr = e1000g_get_itr(Adapter->rx_pkt_cnt, Adapter->tx_pkt_cnt,
-		    Adapter->intr_throttling_rate);
-		if (itr) {
-			E1000_WRITE_REG(hw, E1000_ITR, itr);
-			Adapter->intr_throttling_rate = itr;
-		}
-	}
-
 	/*
 	 * The Receive Sequence errors RXSEQ and the link status change LSC
 	 * are checked to detect that the cable has been pulled out. For
@@ -2040,40 +2185,6 @@
 	}
 }
 
-static uint32_t
-e1000g_get_itr(uint32_t rx_packet, uint32_t tx_packet, uint32_t cur_itr)
-{
-	uint32_t new_itr;
-
-	/*
-	 * Determine a propper itr according to rx/tx packet count
-	 * per interrupt, the value of itr are based on document
-	 * and testing.
-	 */
-	if ((rx_packet < DEFAULT_INTR_PACKET_LOW) ||
-	    (tx_packet < DEFAULT_INTR_PACKET_LOW)) {
-		new_itr = DEFAULT_INTR_THROTTLING_LOW;
-		goto itr_done;
-	}
-	if ((rx_packet > DEFAULT_INTR_PACKET_HIGH) ||
-	    (tx_packet > DEFAULT_INTR_PACKET_HIGH)) {
-		new_itr = DEFAULT_INTR_THROTTLING_LOW;
-		goto itr_done;
-	}
-	if (cur_itr < DEFAULT_INTR_THROTTLING_HIGH) {
-		new_itr = cur_itr + (DEFAULT_INTR_THROTTLING_HIGH >> 2);
-		if (new_itr > DEFAULT_INTR_THROTTLING_HIGH)
-			new_itr = DEFAULT_INTR_THROTTLING_HIGH;
-	} else
-		new_itr = DEFAULT_INTR_THROTTLING_HIGH;
-
-itr_done:
-	if (cur_itr == new_itr)
-		return (0);
-	else
-		return (new_itr);
-}
-
 static void
 e1000g_init_unicst(struct e1000g *Adapter)
 {
@@ -2082,45 +2193,33 @@
 
 	hw = &Adapter->shared;
 
-	if (!Adapter->unicst_init) {
+	if (Adapter->init_count == 0) {
 		/* Initialize the multiple unicast addresses */
 		Adapter->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
 
+		/* Workaround for an erratum of the 82571 chipset */
 		if ((hw->mac.type == e1000_82571) &&
 		    (e1000_get_laa_state_82571(hw) == B_TRUE))
 			Adapter->unicst_total--;
 
-		Adapter->unicst_avail = Adapter->unicst_total - 1;
+		Adapter->unicst_avail = Adapter->unicst_total;
 
-		/* Store the default mac address */
-		e1000_rar_set(hw, hw->mac.addr, 0);
-		if ((hw->mac.type == e1000_82571) &&
-		    (e1000_get_laa_state_82571(hw) == B_TRUE))
-			e1000_rar_set(hw, hw->mac.addr, LAST_RAR_ENTRY);
-
-		bcopy(hw->mac.addr, Adapter->unicst_addr[0].mac.addr,
-		    ETHERADDRL);
-		Adapter->unicst_addr[0].mac.set = 1;
-
-		for (slot = 1; slot < Adapter->unicst_total; slot++)
-			Adapter->unicst_addr[slot].mac.set = 0;
-
-		Adapter->unicst_init = B_TRUE;
+		for (slot = 0; slot < Adapter->unicst_total; slot++) {
+			/* Clear both the flag and MAC address */
+			Adapter->unicst_addr[slot].reg.high = 0;
+			Adapter->unicst_addr[slot].reg.low = 0;
+		}
 	} else {
-		/* Recover the default mac address */
-		bcopy(Adapter->unicst_addr[0].mac.addr, hw->mac.addr,
-		    ETHERADDRL);
-
-		/* Store the default mac address */
-		e1000_rar_set(hw, hw->mac.addr, 0);
+		/* Workaround for an erratum of the 82571 chipset */
 		if ((hw->mac.type == e1000_82571) &&
 		    (e1000_get_laa_state_82571(hw) == B_TRUE))
 			e1000_rar_set(hw, hw->mac.addr, LAST_RAR_ENTRY);
 
 		/* Re-configure the RAR registers */
-		for (slot = 1; slot < Adapter->unicst_total; slot++)
-			e1000_rar_set(hw,
-			    Adapter->unicst_addr[slot].mac.addr, slot);
+		for (slot = 0; slot < Adapter->unicst_total; slot++)
+			if (Adapter->unicst_addr[slot].mac.set == 1)
+				e1000_rar_set(hw,
+				    Adapter->unicst_addr[slot].mac.addr, slot);
 	}
 
 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK)
@@ -2128,22 +2227,8 @@
 }
 
 static int
-e1000g_m_unicst(void *arg, const uint8_t *mac_addr)
-{
-	struct e1000g *Adapter;
-
-	Adapter = (struct e1000g *)arg;
-
-	/* Store the default MAC address */
-	bcopy(mac_addr, Adapter->shared.mac.addr, ETHERADDRL);
-
-	/* Set MAC address in address slot 0, which is the default address */
-	return (e1000g_unicst_set(Adapter, mac_addr, 0));
-}
-
-static int
 e1000g_unicst_set(struct e1000g *Adapter, const uint8_t *mac_addr,
-    mac_addr_slot_t slot)
+    int slot)
 {
 	struct e1000_hw *hw;
@@ -2166,14 +2251,36 @@
 		E1000_WRITE_REG(hw, E1000_RCTL, E1000_RCTL_RST);
 		msec_delay(5);
 	}
+	if (mac_addr == NULL) {
+		E1000_WRITE_REG_ARRAY(hw, E1000_RA, slot << 1, 0);
+		E1000_WRITE_FLUSH(hw);
+		E1000_WRITE_REG_ARRAY(hw, E1000_RA, (slot << 1) + 1, 0);
+		E1000_WRITE_FLUSH(hw);
+		/* Clear both the flag and MAC address */
+		Adapter->unicst_addr[slot].reg.high = 0;
+		Adapter->unicst_addr[slot].reg.low = 0;
+	} else {
+		bcopy(mac_addr, Adapter->unicst_addr[slot].mac.addr,
+		    ETHERADDRL);
+		e1000_rar_set(hw, (uint8_t *)mac_addr, slot);
+		Adapter->unicst_addr[slot].mac.set = 1;
+	}
 
-	bcopy(mac_addr, Adapter->unicst_addr[slot].mac.addr, ETHERADDRL);
-	e1000_rar_set(hw, (uint8_t *)mac_addr, slot);
-
+	/* Workaround for an erratum of the 82571 chipset */
 	if (slot == 0) {
 		if ((hw->mac.type == e1000_82571) &&
 		    (e1000_get_laa_state_82571(hw) == B_TRUE))
-			e1000_rar_set(hw, (uint8_t *)mac_addr, LAST_RAR_ENTRY);
+			if (mac_addr == NULL) {
+				E1000_WRITE_REG_ARRAY(hw, E1000_RA,
+				    slot << 1, 0);
+				E1000_WRITE_FLUSH(hw);
+				E1000_WRITE_REG_ARRAY(hw, E1000_RA,
+				    (slot << 1) + 1, 0);
+				E1000_WRITE_FLUSH(hw);
+			} else {
+				e1000_rar_set(hw, (uint8_t *)mac_addr,
+				    LAST_RAR_ENTRY);
+			}
 	}
 
 	/*
@@ -2192,7 +2299,6 @@
 	}
 
 	rw_exit(&Adapter->chip_lock);
-
 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
 		return (EIO);
@@ -2201,163 +2307,6 @@
 	return (0);
 }
 
-/*
- * e1000g_m_unicst_add() - will find an unused address slot, set the
- * address value to the one specified, reserve that slot and enable
- * the NIC to start filtering on the new MAC address.
- * Returns 0 on success.
- */
-static int
-e1000g_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
-{
-	struct e1000g *Adapter = (struct e1000g *)arg;
-	mac_addr_slot_t slot;
-	int err;
-
-	if (mac_unicst_verify(Adapter->mh,
-	    maddr->mma_addr, maddr->mma_addrlen) == B_FALSE)
-		return (EINVAL);
-
-	rw_enter(&Adapter->chip_lock, RW_WRITER);
-	if (Adapter->unicst_avail == 0) {
-		/* no slots available */
-		rw_exit(&Adapter->chip_lock);
-		return (ENOSPC);
-	}
-
-	/*
-	 * Primary/default address is in slot 0. The next addresses
-	 * are the multiple MAC addresses. So multiple MAC address 0
-	 * is in slot 1, 1 in slot 2, and so on.
So the first multiple - * MAC address resides in slot 1. - */ - for (slot = 1; slot < Adapter->unicst_total; slot++) { - if (Adapter->unicst_addr[slot].mac.set == 0) { - Adapter->unicst_addr[slot].mac.set = 1; - break; - } - } - - ASSERT((slot > 0) && (slot < Adapter->unicst_total)); - - Adapter->unicst_avail--; - rw_exit(&Adapter->chip_lock); - - maddr->mma_slot = slot; - - if ((err = e1000g_unicst_set(Adapter, maddr->mma_addr, slot)) != 0) { - rw_enter(&Adapter->chip_lock, RW_WRITER); - Adapter->unicst_addr[slot].mac.set = 0; - Adapter->unicst_avail++; - rw_exit(&Adapter->chip_lock); - } - - return (err); -} - -/* - * e1000g_m_unicst_remove() - removes a MAC address that was added by a - * call to e1000g_m_unicst_add(). The slot number that was returned in - * e1000g_m_unicst_add() is passed in the call to remove the address. - * Returns 0 on success. - */ -static int -e1000g_m_unicst_remove(void *arg, mac_addr_slot_t slot) -{ - struct e1000g *Adapter = (struct e1000g *)arg; - int err; - - if ((slot <= 0) || (slot >= Adapter->unicst_total)) - return (EINVAL); - - rw_enter(&Adapter->chip_lock, RW_WRITER); - if (Adapter->unicst_addr[slot].mac.set == 1) { - Adapter->unicst_addr[slot].mac.set = 0; - Adapter->unicst_avail++; - rw_exit(&Adapter->chip_lock); - - /* Copy the default address to the passed slot */ - if ((err = e1000g_unicst_set(Adapter, - Adapter->unicst_addr[0].mac.addr, slot)) != 0) { - rw_enter(&Adapter->chip_lock, RW_WRITER); - Adapter->unicst_addr[slot].mac.set = 1; - Adapter->unicst_avail--; - rw_exit(&Adapter->chip_lock); - } - return (err); - } - rw_exit(&Adapter->chip_lock); - - return (EINVAL); -} - -/* - * e1000g_m_unicst_modify() - modifies the value of an address that - * has been added by e1000g_m_unicst_add(). The new address, address - * length and the slot number that was returned in the call to add - * should be passed to e1000g_m_unicst_modify(). mma_flags should be - * set to 0. Returns 0 on success. - */ -static int -e1000g_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) -{ - struct e1000g *Adapter = (struct e1000g *)arg; - mac_addr_slot_t slot; - - if (mac_unicst_verify(Adapter->mh, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - slot = maddr->mma_slot; - - if ((slot <= 0) || (slot >= Adapter->unicst_total)) - return (EINVAL); - - rw_enter(&Adapter->chip_lock, RW_WRITER); - if (Adapter->unicst_addr[slot].mac.set == 1) { - rw_exit(&Adapter->chip_lock); - - return (e1000g_unicst_set(Adapter, maddr->mma_addr, slot)); - } - rw_exit(&Adapter->chip_lock); - - return (EINVAL); -} - -/* - * e1000g_m_unicst_get() - will get the MAC address and all other - * information related to the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. 
- * On return, mma_flags can take the following values:
- * 1) MMAC_SLOT_UNUSED
- * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR
- * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR
- * 4) MMAC_SLOT_USED
- */
-static int
-e1000g_m_unicst_get(void *arg, mac_multi_addr_t *maddr)
-{
-	struct e1000g *Adapter = (struct e1000g *)arg;
-	mac_addr_slot_t slot;
-
-	slot = maddr->mma_slot;
-
-	if ((slot <= 0) || (slot >= Adapter->unicst_total))
-		return (EINVAL);
-
-	rw_enter(&Adapter->chip_lock, RW_WRITER);
-	if (Adapter->unicst_addr[slot].mac.set == 1) {
-		bcopy(Adapter->unicst_addr[slot].mac.addr,
-		    maddr->mma_addr, ETHERADDRL);
-		maddr->mma_flags = MMAC_SLOT_USED;
-	} else {
-		maddr->mma_flags = MMAC_SLOT_UNUSED;
-	}
-	rw_exit(&Adapter->chip_lock);
-
-	return (0);
-}
-
 static int
 multicst_add(struct e1000g *Adapter, const uint8_t *multiaddr)
 {
@@ -2586,6 +2535,274 @@ e1000g_m_promisc(void *arg, boolean_t on)
 	return (0);
 }
 
+/*
+ * Entry points to enable and disable interrupts at the granularity of
+ * a group.
+ * Turns the poll_mode for the whole adapter on and off to enable or
+ * override the ring-level polling control over the hardware interrupts.
+ */
+static int
+e1000g_rx_group_intr_enable(mac_intr_handle_t arg)
+{
+	struct e1000g *adapter = (struct e1000g *)arg;
+	e1000g_rx_ring_t *rx_ring = adapter->rx_ring;
+
+	/*
+	 * Later interrupts at the granularity of this ring will
+	 * invoke mac_rx() with NULL, indicating the need for
+	 * software classification.
+	 * We have a single ring usable per adapter now, so we only need to
+	 * reset the rx handle for that one.
+	 * When more RX rings can be used, we should update each one of them.
+	 */
+	mutex_enter(&rx_ring->rx_lock);
+	rx_ring->mrh = NULL;
+	adapter->poll_mode = B_FALSE;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+static int
+e1000g_rx_group_intr_disable(mac_intr_handle_t arg)
+{
+	struct e1000g *adapter = (struct e1000g *)arg;
+	e1000g_rx_ring_t *rx_ring = adapter->rx_ring;
+
+	mutex_enter(&rx_ring->rx_lock);
+
+	/*
+	 * Later interrupts at the granularity of this ring will
+	 * invoke mac_rx() with the handle for this ring.
+	 */
+	adapter->poll_mode = B_TRUE;
+	rx_ring->mrh = rx_ring->mrh_init;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+/*
+ * Entry points to enable and disable interrupts at the granularity of
+ * a ring.
+ * The adapter's poll_mode controls whether we actually proceed with
+ * hardware interrupt toggling.
+ */ +static int +e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh) +{ + e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh; + struct e1000g *adapter = rx_ring->adapter; + struct e1000_hw *hw = &adapter->shared; + uint32_t intr_mask; + boolean_t poll_mode; + + mutex_enter(&rx_ring->rx_lock); + rx_ring->poll_flag = 0; + poll_mode = adapter->poll_mode; + mutex_exit(&rx_ring->rx_lock); + + if (poll_mode) { + /* Rx interrupt enabling for MSI and legacy */ + intr_mask = E1000_READ_REG(hw, E1000_IMS); + intr_mask |= E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMS, intr_mask); + E1000_WRITE_FLUSH(hw); + + /* Trigger a Rx interrupt to check Rx ring */ + E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0); + E1000_WRITE_FLUSH(hw); + } + return (0); +} + +static int +e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh) +{ + e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh; + struct e1000g *adapter = rx_ring->adapter; + struct e1000_hw *hw = &adapter->shared; + boolean_t poll_mode; + + /* + * Once the adapter can support per Rx ring interrupt, + * we should disable the real interrupt instead of just setting + * the flag. + */ + mutex_enter(&rx_ring->rx_lock); + rx_ring->poll_flag = 1; + poll_mode = adapter->poll_mode; + mutex_exit(&rx_ring->rx_lock); + + if (poll_mode) { + /* Rx interrupt disabling for MSI and legacy */ + E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0); + E1000_WRITE_FLUSH(hw); + } + return (0); +} + +/* + * e1000g_unicst_find - Find the slot for the specified unicast address + */ +static int +e1000g_unicst_find(struct e1000g *Adapter, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(&Adapter->gen_lock)); + + for (slot = 0; slot < Adapter->unicst_total; slot++) { + if (Adapter->unicst_addr[slot].mac.set == 1) { + if (bcmp(Adapter->unicst_addr[slot].mac.addr, + mac_addr, ETHERADDRL) == 0) + return (slot); + } else + continue; + } + + return (-1); +} + +/* + * Entry points to add and remove a MAC address to a ring group. + * The caller takes care of adding and removing the MAC addresses + * to the filter via these two routines. 
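Seen from the consumer side, the ring-level entry points above combine with e1000g_poll_ring() into a simple duty cycle: disable the ring's interrupt, pull packets in budget-sized chunks until the ring runs dry, then re-enable the interrupt. A hypothetical caller-side sketch; deliver() stands in for the upstream hand-off, and the real sequencing lives in the MAC soft ring machinery rather than in the driver:

extern void deliver(mblk_t *);		/* hypothetical hand-off */

static void
poll_cycle(e1000g_rx_ring_t *rx_ring, int budget)
{
	mac_intr_handle_t ih = (mac_intr_handle_t)rx_ring;
	mblk_t *chain;

	(void) e1000g_rx_ring_intr_disable(ih);
	while ((chain = e1000g_poll_ring(rx_ring, budget)) != NULL)
		deliver(chain);
	(void) e1000g_rx_ring_intr_enable(ih);
}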
+ */
+
+static int
+e1000g_addmac(void *arg, const uint8_t *mac_addr)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	int slot;
+
+	mutex_enter(&Adapter->gen_lock);
+
+	if (e1000g_unicst_find(Adapter, mac_addr) != -1) {
+		/* The same address is already in a slot */
+		mutex_exit(&Adapter->gen_lock);
+		return (0);
+	}
+
+	if (Adapter->unicst_avail == 0) {
+		/* no slots available */
+		mutex_exit(&Adapter->gen_lock);
+		return (ENOSPC);
+	}
+
+	/* Search for a free slot */
+	for (slot = 0; slot < Adapter->unicst_total; slot++) {
+		if (Adapter->unicst_addr[slot].mac.set == 0)
+			break;
+	}
+	ASSERT(slot < Adapter->unicst_total);
+
+	e1000g_unicst_set(Adapter, mac_addr, slot);
+	Adapter->unicst_avail--;
+
+	mutex_exit(&Adapter->gen_lock);
+
+	return (0);
+}
+
+static int
+e1000g_remmac(void *arg, const uint8_t *mac_addr)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	int slot;
+
+	mutex_enter(&Adapter->gen_lock);
+
+	slot = e1000g_unicst_find(Adapter, mac_addr);
+	if (slot == -1) {
+		mutex_exit(&Adapter->gen_lock);
+		return (EINVAL);
+	}
+
+	ASSERT(Adapter->unicst_addr[slot].mac.set);
+
+	/* Clear this slot */
+	e1000g_unicst_set(Adapter, NULL, slot);
+	Adapter->unicst_avail++;
+
+	mutex_exit(&Adapter->gen_lock);
+
+	return (0);
+}
+
+static int
+e1000g_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+	e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)rh;
+
+	mutex_enter(&rx_ring->rx_lock);
+	rx_ring->ring_gen_num = mr_gen_num;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ *
+ * The hardware supports a single group with currently only one ring
+ * available.
+ * Though not offering virtualization ability per se, exposing the
+ * group/ring still enables the polling and interrupt toggling.
+ */
+void
+e1000g_fill_ring(void *arg, mac_ring_type_t rtype, const int grp_index,
+    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	e1000g_rx_ring_t *rx_ring = Adapter->rx_ring;
+	mac_intr_t *mintr;
+
+	/*
+	 * We advertised only RX group/rings, so the MAC framework shouldn't
+	 * ask for anything else.
+	 */
+	ASSERT(rtype == MAC_RING_TYPE_RX && grp_index == 0 && ring_index == 0);
+
+	rx_ring->mrh = rx_ring->mrh_init = rh;
+	infop->mri_driver = (mac_ring_driver_t)rx_ring;
+	infop->mri_start = e1000g_ring_start;
+	infop->mri_stop = NULL;
+	infop->mri_poll = e1000g_poll_ring;
+
+	/* Ring level interrupts */
+	mintr = &infop->mri_intr;
+	mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+	mintr->mi_enable = e1000g_rx_ring_intr_enable;
+	mintr->mi_disable = e1000g_rx_ring_intr_disable;
+}
+
+static void
+e1000g_fill_group(void *arg, mac_ring_type_t rtype, const int grp_index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	mac_intr_t *mintr;
+
+	/*
+	 * We advertised a single RX ring. Getting a request for anything else
+	 * signifies a bug in the MAC framework.
+ */ + ASSERT(rtype == MAC_RING_TYPE_RX && grp_index == 0); + + Adapter->rx_group = gh; + + infop->mgi_driver = (mac_group_driver_t)Adapter; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = e1000g_addmac; + infop->mgi_remmac = e1000g_remmac; + infop->mgi_count = 1; + + /* Group level interrupts */ + mintr = &infop->mgi_intr; + mintr->mi_handle = (mac_intr_handle_t)Adapter; + mintr->mi_enable = e1000g_rx_group_intr_enable; + mintr->mi_disable = e1000g_rx_group_intr_disable; +} + static boolean_t e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { @@ -2602,34 +2819,6 @@ e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. - */ - break; - - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; - - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. - */ - mmacp->maddr_naddr = Adapter->unicst_total - 1; - mmacp->maddr_naddrfree = Adapter->unicst_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = Adapter; - mmacp->maddr_add = e1000g_m_unicst_add; - mmacp->maddr_remove = e1000g_m_unicst_remove; - mmacp->maddr_modify = e1000g_m_unicst_modify; - mmacp->maddr_get = e1000g_m_unicst_get; - mmacp->maddr_reserve = NULL; - break; - } case MAC_CAPAB_LSO: { mac_capab_lso_t *cap_lso = cap_data; @@ -2642,7 +2831,20 @@ e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + /* No TX rings exposed yet */ + if (cap_rings->mr_type != MAC_RING_TYPE_RX) + return (B_FALSE); + + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = 1; + cap_rings->mr_gnum = 1; + cap_rings->mr_rget = e1000g_fill_ring; + cap_rings->mr_gget = e1000g_fill_group; + break; + } default: return (B_FALSE); } @@ -3124,32 +3326,6 @@ e1000g_set_priv_prop(struct e1000g *Adapter, const char *pr_name, } return (err); } - if (strcmp(pr_name, "_tx_recycle_thresh") == 0) { - if (pr_val == NULL) { - err = EINVAL; - return (err); - } - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); - if (result < MIN_TX_RECYCLE_THRESHOLD || - result > MAX_TX_RECYCLE_THRESHOLD) - err = EINVAL; - else - Adapter->tx_recycle_thresh = (uint32_t)result; - return (err); - } - if (strcmp(pr_name, "_tx_recycle_num") == 0) { - if (pr_val == NULL) { - err = EINVAL; - return (err); - } - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); - if (result < MIN_TX_RECYCLE_NUM || - result > MAX_TX_RECYCLE_NUM) - err = EINVAL; - else - Adapter->tx_recycle_num = (uint32_t)result; - return (err); - } return (ENOTSUP); } @@ -3236,18 +3412,6 @@ e1000g_get_priv_prop(struct e1000g *Adapter, const char *pr_name, err = 0; goto done; } - if (strcmp(pr_name, "_tx_recycle_thresh") == 0) { - value = (is_default ? DEFAULT_TX_RECYCLE_THRESHOLD : - Adapter->tx_recycle_thresh); - err = 0; - goto done; - } - if (strcmp(pr_name, "_tx_recycle_num") == 0) { - value = (is_default ? 
DEFAULT_TX_RECYCLE_NUM : - Adapter->tx_recycle_num); - err = 0; - goto done; - } done: if (err == 0) { (void) snprintf(pr_val, pr_valsize, "%d", value); @@ -3368,22 +3532,6 @@ e1000g_get_conf(struct e1000g *Adapter) B_TRUE : B_FALSE; /* - * Tx recycle threshold - */ - Adapter->tx_recycle_thresh = - e1000g_get_prop(Adapter, "tx_recycle_thresh", - MIN_TX_RECYCLE_THRESHOLD, MAX_TX_RECYCLE_THRESHOLD, - DEFAULT_TX_RECYCLE_THRESHOLD); - - /* - * Tx recycle descriptor number - */ - Adapter->tx_recycle_num = - e1000g_get_prop(Adapter, "tx_recycle_num", - MIN_TX_RECYCLE_NUM, MAX_TX_RECYCLE_NUM, - DEFAULT_TX_RECYCLE_NUM); - - /* * Hardware checksum enable/disable parameter */ Adapter->tx_hcksum_enable = @@ -3672,6 +3820,23 @@ e1000g_reset_link(struct e1000g *Adapter) } static void +e1000g_timer_tx_resched(struct e1000g *Adapter) +{ + e1000g_tx_ring_t *tx_ring = Adapter->tx_ring; + + if (tx_ring->resched_needed && + ((ddi_get_lbolt() - tx_ring->resched_timestamp) > + drv_usectohz(1000000)) && + (Adapter->chip_state == E1000G_START) && + (tx_ring->tbd_avail >= DEFAULT_TX_NO_RESOURCE)) { + tx_ring->resched_needed = B_FALSE; + mac_tx_update(Adapter->mh); + E1000G_STAT(tx_ring->stat_reschedule); + E1000G_STAT(tx_ring->stat_timer_reschedule); + } +} + +static void e1000g_local_timer(void *ws) { struct e1000g *Adapter = (struct e1000g *)ws; @@ -3683,10 +3848,11 @@ e1000g_local_timer(void *ws) if (Adapter->chip_state == E1000G_ERROR) { Adapter->reset_count++; - if (e1000g_global_reset(Adapter)) + if (e1000g_global_reset(Adapter)) { ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_RESTORED); - else + e1000g_timer_tx_resched(Adapter); + } else ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_LOST); return; @@ -3697,10 +3863,11 @@ e1000g_local_timer(void *ws) "Tx stall detected. Activate automatic recovery.\n"); e1000g_fm_ereport(Adapter, DDI_FM_DEVICE_STALL); Adapter->reset_count++; - if (e1000g_reset_adapter(Adapter)) + if (e1000g_reset_adapter(Adapter)) { ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_RESTORED); - else + e1000g_timer_tx_resched(Adapter); + } else ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_LOST); return; @@ -3769,6 +3936,8 @@ e1000g_local_timer(void *ws) if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED); + else + e1000g_timer_tx_resched(Adapter); restart_watchdog_timer(Adapter); } diff --git a/usr/src/uts/common/io/e1000g/e1000g_rx.c b/usr/src/uts/common/io/e1000g/e1000g_rx.c index 3bb4a5e90f..15d22b8c9a 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_rx.c +++ b/usr/src/uts/common/io/e1000g/e1000g_rx.c @@ -20,7 +20,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDLv1. + * Use is subject to license terms. */ /* @@ -147,10 +147,16 @@ e1000g_rxfree_func(p_rx_sw_packet_t packet) } } - mutex_enter(&rx_ring->freelist_lock); - QUEUE_PUSH_TAIL(&rx_ring->free_list, &packet->Link); - rx_ring->avail_freepkt++; - mutex_exit(&rx_ring->freelist_lock); + /* + * Enqueue the recycled packets in a recycle queue. When freelist + * dries up, move the entire chain of packets from recycle queue + * to freelist. This helps in avoiding per packet mutex contention + * around freelist. 
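The consuming side of this recycle scheme is e1000g_get_buf() in the next hunk: the free path touches only recycle_lock, while the allocation path holds freelist_lock and, when its freelist runs dry, splices the entire recycle list across in one constant-time QUEUE_SWITCH(). In miniature, with hypothetical ring_t, q_pop() and q_switch() stand-ins for the driver's queue primitives:

typedef struct ring {
	kmutex_t	freelist_lock;	/* allocation side */
	kmutex_t	recycle_lock;	/* free side */
	void		*free_list;
	void		*recycle_list;
} ring_t;

extern void *q_pop(void **);		/* stand-in for QUEUE_POP_HEAD */
extern void q_switch(void **, void **);	/* stand-in for QUEUE_SWITCH, O(1) */

static void *
buf_get(ring_t *r)
{
	void *pkt;

	mutex_enter(&r->freelist_lock);
	if ((pkt = q_pop(&r->free_list)) == NULL) {
		/* Freelist dry: splice in everything recycled so far. */
		mutex_enter(&r->recycle_lock);
		q_switch(&r->free_list, &r->recycle_list);
		mutex_exit(&r->recycle_lock);
		pkt = q_pop(&r->free_list);
	}
	mutex_exit(&r->freelist_lock);
	return (pkt);
}

The two locks are only ever held together on this refill path, so the steady-state producer (packet free) and consumer (buffer allocation) no longer contend on every packet.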
+	 */
+	mutex_enter(&rx_ring->recycle_lock);
+	QUEUE_PUSH_TAIL(&rx_ring->recycle_list, &packet->Link);
+	rx_ring->recycle_freepkt++;
+	mutex_exit(&rx_ring->recycle_lock);
 
 	rw_exit(&e1000g_rx_detach_lock);
 }
@@ -236,6 +242,8 @@ e1000g_rx_setup(struct e1000g *Adapter)
 
 		/* Init the list of "Free Receive Buffer" */
 		QUEUE_INIT_LIST(&rx_ring->free_list);
+		/* Init the recycle list of "Free Receive Buffer" */
+		QUEUE_INIT_LIST(&rx_ring->recycle_list);
 
 		/*
 		 * Setup Receive list and the Free list. Note that
 		 * the both were allocated in one packet area.
@@ -263,6 +271,7 @@
 			    &packet->Link);
 		}
 		rx_ring->avail_freepkt = Adapter->rx_freelist_num;
+		rx_ring->recycle_freepkt = 0;
 
 		Adapter->rx_buffer_setup = B_TRUE;
 	} else {
@@ -414,8 +423,23 @@ e1000g_get_buf(e1000g_rx_ring_t *rx_ring)
 	mutex_enter(&rx_ring->freelist_lock);
 	packet = (p_rx_sw_packet_t)
 	    QUEUE_POP_HEAD(&rx_ring->free_list);
-	if (packet != NULL)
+	if (packet != NULL) {
 		rx_ring->avail_freepkt--;
+	} else {
+		/*
+		 * If the freelist has no packets, check the recycle list
+		 * to see if there are any available descriptors there.
+		 */
+		mutex_enter(&rx_ring->recycle_lock);
+		QUEUE_SWITCH(&rx_ring->free_list, &rx_ring->recycle_list);
+		rx_ring->avail_freepkt = rx_ring->recycle_freepkt;
+		rx_ring->recycle_freepkt = 0;
+		mutex_exit(&rx_ring->recycle_lock);
+		packet = (p_rx_sw_packet_t)
+		    QUEUE_POP_HEAD(&rx_ring->free_list);
+		if (packet != NULL)
+			rx_ring->avail_freepkt--;
+	}
 	mutex_exit(&rx_ring->freelist_lock);
 
 	return (packet);
@@ -427,7 +451,7 @@
  * This routine will process packets received in an interrupt
 */
 mblk_t *
-e1000g_receive(struct e1000g *Adapter)
+e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
 {
 	struct e1000_hw *hw;
 	mblk_t *nmp;
@@ -443,7 +467,7 @@
 	boolean_t accept_frame;
 	boolean_t end_of_packet;
 	boolean_t need_copy;
-	e1000g_rx_ring_t *rx_ring;
+	struct e1000g *Adapter;
 	dma_buffer_t *rx_buf;
 	uint16_t cksumflags;
 
@@ -452,9 +476,10 @@
 	pkt_count = 0;
 	desc_count = 0;
 	cksumflags = 0;
+	*sz = 0;
 
+	Adapter = rx_ring->adapter;
 	hw = &Adapter->shared;
-	rx_ring = Adapter->rx_ring;
 
 	/* Sync the Rx descriptor DMA buffers */
 	(void) ddi_dma_sync(rx_ring->rbd_dma_handle,
@@ -805,6 +830,8 @@ rx_end_of_packet:
 			ret_nmp = rx_ring->rx_mblk;
 		}
 		ret_nmp->b_next = NULL;
+		*tail = ret_nmp;
+		*sz += length;
 
 		rx_ring->rx_mblk = NULL;
 		rx_ring->rx_mblk_tail = NULL;
diff --git a/usr/src/uts/common/io/e1000g/e1000g_stat.c b/usr/src/uts/common/io/e1000g/e1000g_stat.c
index 7df4317e9e..0c67c914a5 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_stat.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_stat.c
@@ -20,7 +20,7 @@
 
 /*
  * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDLv1.
+ * Use is subject to license terms.
*/ /* @@ -185,7 +185,8 @@ e1000g_update_stats(kstat_t *ksp, int rw) e1000g_ksp->rx_none.value.ul = rx_ring->stat_none; e1000g_ksp->rx_multi_desc.value.ul = rx_ring->stat_multi_desc; e1000g_ksp->rx_no_freepkt.value.ul = rx_ring->stat_no_freepkt; - e1000g_ksp->rx_avail_freepkt.value.ul = rx_ring->avail_freepkt; + e1000g_ksp->rx_avail_freepkt.value.ul = rx_ring->avail_freepkt + + rx_ring->recycle_freepkt; e1000g_ksp->tx_under_size.value.ul = tx_ring->stat_under_size; e1000g_ksp->tx_exceed_frags.value.ul = tx_ring->stat_exceed_frags; diff --git a/usr/src/uts/common/io/e1000g/e1000g_sw.h b/usr/src/uts/common/io/e1000g/e1000g_sw.h index 605440cd48..e7c56a5877 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_sw.h +++ b/usr/src/uts/common/io/e1000g/e1000g_sw.h @@ -54,7 +54,7 @@ extern "C" { #include <sys/kstat.h> #include <sys/modctl.h> #include <sys/errno.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/vlan.h> #include <sys/ddi.h> @@ -114,8 +114,6 @@ extern "C" { #define MAX_INTR_THROTTLING 65535 #define MAX_RX_BCOPY_THRESHOLD E1000_RX_BUFFER_SIZE_2K #define MAX_TX_BCOPY_THRESHOLD E1000_TX_BUFFER_SIZE_2K -#define MAX_TX_RECYCLE_THRESHOLD MAX_NUM_TX_DESCRIPTOR -#define MAX_TX_RECYCLE_NUM MAX_NUM_TX_DESCRIPTOR #define MIN_NUM_TX_DESCRIPTOR 80 #define MIN_NUM_RX_DESCRIPTOR 80 @@ -129,8 +127,6 @@ extern "C" { #define MIN_INTR_THROTTLING 0 #define MIN_RX_BCOPY_THRESHOLD 0 #define MIN_TX_BCOPY_THRESHOLD ETHERMIN -#define MIN_TX_RECYCLE_THRESHOLD 0 -#define MIN_TX_RECYCLE_NUM MAX_TX_DESC_PER_PACKET #define DEFAULT_NUM_RX_DESCRIPTOR 2048 #define DEFAULT_NUM_TX_DESCRIPTOR 2048 @@ -143,13 +139,11 @@ extern "C" { #define MIN_INTR_PER_SEC 3000 #define DEFAULT_INTR_PACKET_LOW 5 #define DEFAULT_INTR_PACKET_HIGH 128 -#define DEFAULT_TX_RECYCLE_THRESHOLD 512 #else #define MAX_INTR_PER_SEC 15000 #define MIN_INTR_PER_SEC 4000 #define DEFAULT_INTR_PACKET_LOW 10 #define DEFAULT_INTR_PACKET_HIGH 48 -#define DEFAULT_TX_RECYCLE_THRESHOLD DEFAULT_TX_NO_RESOURCE #endif #define DEFAULT_RX_INTR_DELAY 0 @@ -162,7 +156,6 @@ extern "C" { #define DEFAULT_RX_BCOPY_THRESHOLD 128 #define DEFAULT_TX_BCOPY_THRESHOLD 512 -#define DEFAULT_TX_RECYCLE_NUM 64 #define DEFAULT_TX_UPDATE_THRESHOLD 256 #define DEFAULT_TX_NO_RESOURCE MAX_TX_DESC_PER_PACKET @@ -402,6 +395,14 @@ extern "C" { (_LH1)->Blink = ((PSINGLE_LIST_LINK)(_LH2)->Blink); \ } + +#define QUEUE_SWITCH(_LH1, _LH2) \ + if ((_LH2)->Flink) { \ + (_LH1)->Flink = (_LH2)->Flink; \ + (_LH1)->Blink = (_LH2)->Blink; \ + (_LH2)->Flink = (_LH2)->Blink = (PSINGLE_LIST_LINK)0; \ + } + /* * Property lookups */ @@ -717,6 +718,7 @@ typedef struct _e1000g_tx_ring { * reschedule when tx resource is available */ boolean_t resched_needed; + clock_t resched_timestamp; uint32_t stall_watchdog; uint32_t recycle_fail; mblk_list_t mblks; @@ -727,6 +729,7 @@ typedef struct _e1000g_tx_ring { uint32_t stat_no_desc; uint32_t stat_send_fail; uint32_t stat_reschedule; + uint32_t stat_timer_reschedule; uint32_t stat_over_size; #ifdef E1000G_DEBUG uint32_t stat_under_size; @@ -752,6 +755,7 @@ typedef struct _e1000g_tx_ring { typedef struct _e1000g_rx_ring { kmutex_t rx_lock; kmutex_t freelist_lock; + kmutex_t recycle_lock; /* * Descriptor queue definitions */ @@ -768,13 +772,23 @@ typedef struct _e1000g_rx_ring { p_rx_sw_packet_t packet_area; LIST_DESCRIBER recv_list; LIST_DESCRIBER free_list; + LIST_DESCRIBER recycle_list; p_rx_sw_packet_t pending_list; uint32_t pending_count; uint32_t avail_freepkt; + uint32_t recycle_freepkt; uint32_t rx_mblk_len; mblk_t 
*rx_mblk; mblk_t *rx_mblk_tail; + mac_ring_handle_t mrh; + mac_ring_handle_t mrh_init; + uint64_t ring_gen_num; + mblk_t *poll_list_head; + mblk_t *poll_list_tail; + uint_t poll_list_sz; + boolean_t poll_flag; + /* * Statistics */ @@ -833,8 +847,6 @@ typedef struct e1000g { boolean_t intr_adaptive; boolean_t tx_intr_enable; - uint32_t tx_recycle_thresh; - uint32_t tx_recycle_num; uint32_t tx_intr_delay; uint32_t tx_intr_abs_delay; uint32_t rx_intr_delay; @@ -853,6 +865,9 @@ typedef struct e1000g { e1000g_rx_ring_t rx_ring[1]; e1000g_tx_ring_t tx_ring[1]; + mac_group_handle_t rx_group; + + kmutex_t gen_lock; /* General lock for the whole struct e1000g */ /* * Rx and Tx packet count for interrupt adaptive setting @@ -909,6 +924,8 @@ typedef struct e1000g { kstat_t *e1000g_ksp; + boolean_t poll_mode; + uint16_t phy_ctrl; /* contents of PHY_CTRL */ uint16_t phy_status; /* contents of PHY_STATUS */ uint16_t phy_an_adv; /* contents of PHY_AUTONEG_ADV */ @@ -980,7 +997,7 @@ void e1000g_free_tx_swpkt(p_tx_sw_packet_t packet); void e1000g_tx_freemsg(e1000g_tx_ring_t *tx_ring); uint_t e1000g_tx_softint_worker(caddr_t arg1, caddr_t arg2); mblk_t *e1000g_m_tx(void *arg, mblk_t *mp); -mblk_t *e1000g_receive(struct e1000g *Adapter); +mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz); void e1000g_rxfree_func(p_rx_sw_packet_t packet); int e1000g_m_stat(void *arg, uint_t stat, uint64_t *val); @@ -1008,6 +1025,7 @@ extern boolean_t e1000g_force_detach; extern uint32_t e1000g_mblks_pending; extern krwlock_t e1000g_rx_detach_lock; extern private_devi_list_t *e1000g_private_devi_list; +extern int e1000g_poll_mode; #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/e1000g/e1000g_tx.c b/usr/src/uts/common/io/e1000g/e1000g_tx.c index 4255c098b4..d67b67ff63 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_tx.c +++ b/usr/src/uts/common/io/e1000g/e1000g_tx.c @@ -20,7 +20,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDLv1. + * Use is subject to license terms. */ /* @@ -211,8 +211,7 @@ e1000g_send(struct e1000g *Adapter, mblk_t *mp) * Descriptors... As you may run short of them before getting any * transmit interrupt... */ - if (tx_ring->resched_needed || - (tx_ring->tbd_avail < Adapter->tx_recycle_thresh)) { + if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) { (void) e1000g_recycle(tx_ring); E1000G_DEBUG_STAT(tx_ring->stat_recycle); @@ -406,6 +405,7 @@ tx_send_failed: * Enable Transmit interrupts, so that the interrupt routine can * call mac_tx_update() when transmit descriptors become available. */ + tx_ring->resched_timestamp = ddi_get_lbolt(); tx_ring->resched_needed = B_TRUE; if (!Adapter->tx_intr_enable) e1000g_mask_tx_interrupt(Adapter); @@ -434,6 +434,7 @@ tx_no_resource: * Enable Transmit interrupts, so that the interrupt routine can * call mac_tx_update() when transmit descriptors become available. 
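Both failure paths in e1000g_send() now stamp resched_timestamp before raising resched_needed, which is what lets the new e1000g_timer_tx_resched() (added to e1000g_main.c above) recover a reschedule whose tx interrupt never arrived. A small user-space sketch of that guard follows; TICKS_PER_SEC stands in for drv_usectohz(1000000), TX_RESUME_THRESHOLD for DEFAULT_TX_NO_RESOURCE, and the chip-state check is omitted.

#include <stdbool.h>
#include <stdio.h>

#define TICKS_PER_SEC		100	/* stand-in for drv_usectohz(1000000) */
#define TX_RESUME_THRESHOLD	4	/* stand-in for DEFAULT_TX_NO_RESOURCE */

struct tx_ring {
	bool	resched_needed;		/* set on the tx failure paths */
	long	resched_timestamp;	/* in ticks, like ddi_get_lbolt() */
	int	tbd_avail;		/* free tx descriptors */
};

/*
 * Sketch of the e1000g_timer_tx_resched() check: resume transmission
 * only when a reschedule has been pending for more than a second and
 * descriptors have actually been recycled in the meantime.
 */
static bool
timer_tx_resched(struct tx_ring *tr, long now)
{
	if (tr->resched_needed &&
	    (now - tr->resched_timestamp) > TICKS_PER_SEC &&
	    tr->tbd_avail >= TX_RESUME_THRESHOLD) {
		tr->resched_needed = false;
		return (true);	/* the driver would call mac_tx_update() */
	}
	return (false);
}

int
main(void)
{
	struct tx_ring tr = { true, 0, 8 };

	(void) printf("tick 50:  resume=%d\n", timer_tx_resched(&tr, 50));
	(void) printf("tick 150: resume=%d\n", timer_tx_resched(&tr, 150));
	return (0);
}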
*/ + tx_ring->resched_timestamp = ddi_get_lbolt(); tx_ring->resched_needed = B_TRUE; if (!Adapter->tx_intr_enable) e1000g_mask_tx_interrupt(Adapter); @@ -449,9 +450,14 @@ e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context, uintptr_t ip_start; uintptr_t tcp_start; mblk_t *nmp; + uint32_t lsoflags; + uint32_t mss; bzero(cur_context, sizeof (context_data_t)); + /* first check lso information */ + lso_info_get(mp, &mss, &lsoflags); + /* retrieve checksum info */ hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start, &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags); @@ -464,45 +470,48 @@ e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context, cur_context->ether_header_size = sizeof (struct ether_header); - if (cur_context->cksum_flags & HW_LSO) { - if ((cur_context->mss = DB_LSOMSS(mp)) != 0) { - /* free the invaid packet */ - if (!((cur_context->cksum_flags & HCK_PARTIALCKSUM) && - (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) { - return (B_FALSE); - } - cur_context->lso_flag = B_TRUE; - /* - * Some fields are cleared for the hardware to fill - * in. We don't assume Ethernet header, IP header and - * TCP header are always in the same mblk fragment, - * while we assume each header is always within one - * mblk fragment and Ethernet header is always in the - * first mblk fragment. - */ - nmp = mp; - ip_start = (uintptr_t)(nmp->b_rptr) - + cur_context->ether_header_size; - if (ip_start >= (uintptr_t)(nmp->b_wptr)) { - ip_start = (uintptr_t)nmp->b_cont->b_rptr - + (ip_start - (uintptr_t)(nmp->b_wptr)); - nmp = nmp->b_cont; - } - tcp_start = ip_start + - IPH_HDR_LENGTH((ipha_t *)ip_start); - if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { - tcp_start = (uintptr_t)nmp->b_cont->b_rptr - + (tcp_start - (uintptr_t)(nmp->b_wptr)); - nmp = nmp->b_cont; - } - cur_context->hdr_len = cur_context->ether_header_size - + IPH_HDR_LENGTH((ipha_t *)ip_start) - + TCP_HDR_LENGTH((tcph_t *)tcp_start); - ((ipha_t *)ip_start)->ipha_length = 0; - ((ipha_t *)ip_start)->ipha_hdr_checksum = 0; - /* calculate the TCP packet payload length */ - cur_context->pay_len = msg_size - cur_context->hdr_len; + if (lsoflags & HW_LSO) { + ASSERT(mss != 0); + + /* free the invalid packet */ + if (mss == 0 || + !((cur_context->cksum_flags & HCK_PARTIALCKSUM) && + (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) { + return (B_FALSE); + } + cur_context->mss = (uint16_t)mss; + cur_context->lso_flag = B_TRUE; + + /* + * Some fields are cleared for the hardware to fill + * in. We don't assume Ethernet header, IP header and + * TCP header are always in the same mblk fragment, + * while we assume each header is always within one + * mblk fragment and Ethernet header is always in the + * first mblk fragment. 
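The rewritten branch above fetches the MSS and LSO flags with lso_info_get() rather than the removed DB_LSOMSS() macro, and rejects an LSO packet whose MSS is zero or whose checksum flags are incomplete. A self-contained sketch of just that validation; the flag values here are invented stand-ins for the kernel's HW_LSO, HCK_PARTIALCKSUM and HCK_IPV4_HDRCKSUM bits.

#include <stdbool.h>
#include <stdio.h>

#define HW_LSO			0x04	/* invented stand-in values */
#define HCK_PARTIALCKSUM	0x01
#define HCK_IPV4_HDRCKSUM	0x02

/*
 * Sketch of the new check in e1000g_retrieve_context(): an LSO packet
 * must carry a nonzero MSS and both checksum offload flags, otherwise
 * the caller frees it as invalid.
 */
static bool
lso_context_ok(unsigned int lsoflags, unsigned int mss,
    unsigned int cksum_flags)
{
	if (!(lsoflags & HW_LSO))
		return (true);		/* not LSO: nothing to verify */
	return (mss != 0 &&
	    (cksum_flags & HCK_PARTIALCKSUM) &&
	    (cksum_flags & HCK_IPV4_HDRCKSUM));
}

int
main(void)
{
	(void) printf("%d\n", lso_context_ok(HW_LSO, 1460,
	    HCK_PARTIALCKSUM | HCK_IPV4_HDRCKSUM));	/* 1: accepted */
	(void) printf("%d\n", lso_context_ok(HW_LSO, 0,
	    HCK_PARTIALCKSUM | HCK_IPV4_HDRCKSUM));	/* 0: dropped */
	return (0);
}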
+ */ + nmp = mp; + ip_start = (uintptr_t)(nmp->b_rptr) + + cur_context->ether_header_size; + if (ip_start >= (uintptr_t)(nmp->b_wptr)) { + ip_start = (uintptr_t)nmp->b_cont->b_rptr + + (ip_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; } + tcp_start = ip_start + + IPH_HDR_LENGTH((ipha_t *)ip_start); + if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { + tcp_start = (uintptr_t)nmp->b_cont->b_rptr + + (tcp_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; + } + cur_context->hdr_len = cur_context->ether_header_size + + IPH_HDR_LENGTH((ipha_t *)ip_start) + + TCP_HDR_LENGTH((tcph_t *)tcp_start); + ((ipha_t *)ip_start)->ipha_length = 0; + ((ipha_t *)ip_start)->ipha_hdr_checksum = 0; + /* calculate the TCP packet payload length */ + cur_context->pay_len = msg_size - cur_context->hdr_len; } return (B_TRUE); } @@ -816,7 +825,6 @@ e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list, return (desc_count); } - /* * e1000g_tx_setup - setup tx data structures * @@ -955,7 +963,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) mblk_t *nmp; struct e1000_tx_desc *descriptor; int desc_count; - int is_intr; /* * This function will examine each TxSwPacket in the 'used' queue @@ -972,13 +979,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) return (0); } - is_intr = servicing_interrupt(); - - if (is_intr) - mutex_enter(&tx_ring->usedlist_lock); - else if (mutex_tryenter(&tx_ring->usedlist_lock) == 0) - return (0); - desc_count = 0; QUEUE_INIT_LIST(&pending_list); @@ -987,7 +987,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) 0, 0, DDI_DMA_SYNC_FORKERNEL); if (e1000g_check_dma_handle( tx_ring->tbd_dma_handle) != DDI_FM_OK) { - mutex_exit(&tx_ring->usedlist_lock); ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED); Adapter->chip_state = E1000G_ERROR; return (0); @@ -996,6 +995,7 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) /* * While there are still TxSwPackets in the used queue check them */ + mutex_enter(&tx_ring->usedlist_lock); while ((packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) { @@ -1030,9 +1030,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) descriptor + 1; desc_count += packet->num_desc; - - if (is_intr && (desc_count >= Adapter->tx_recycle_num)) - break; } else { /* * Found a sw packet that the e1000g is not done diff --git a/usr/src/uts/common/io/hxge/hxge.h b/usr/src/uts/common/io/hxge/hxge.h index 837cbbc90c..37183afc7d 100644 --- a/usr/src/uts/common/io/hxge/hxge.h +++ b/usr/src/uts/common/io/hxge/hxge.h @@ -202,7 +202,6 @@ typedef struct _hxge_stats_t { hxge_pfc_stats_t pfc_stats; /* pfc stats */ hxge_port_stats_t port_stats; /* port stats */ - hxge_mmac_stats_t mmac_stats; /* Multi mac. 
stats */ hxge_peu_sys_stats_t peu_sys_stats; /* PEU system stats */ } hxge_stats_t, *p_hxge_stats_t; @@ -357,7 +356,6 @@ struct _hxge_t { uint32_t hxge_port_rbr_size; uint32_t hxge_port_rcr_size; uint32_t hxge_port_tx_ring_size; - hxge_mmac_t hxge_mmac_info; kmutex_t pio_lock; hxge_timeout timeout; diff --git a/usr/src/uts/common/io/hxge/hxge_impl.h b/usr/src/uts/common/io/hxge/hxge_impl.h index 57ad2c9a21..67bab83787 100644 --- a/usr/src/uts/common/io/hxge/hxge_impl.h +++ b/usr/src/uts/common/io/hxge/hxge_impl.h @@ -68,8 +68,7 @@ extern "C" { #include <sys/netlb.h> #include <sys/ddi_intr.h> -#include <sys/mac.h> -#include <sys/mac_impl.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* @@ -315,32 +314,6 @@ typedef struct _dev_regs_t { unsigned char *hxge_romp; /* fcode pointer */ } dev_regs_t, *p_dev_regs_t; -typedef struct _nxge_mac_addr_t { - ether_addr_t addr; - uint_t flags; -} hxge_mac_addr_t; - -/* - * Driver alternate mac address structure. - */ -typedef struct _hxge_mmac_t { - uint8_t total_factory_macs; - uint8_t num_mmac; - uint8_t num_factory_mmac; - hxge_mac_addr_t mac_pool[16]; - ether_addr_t factory_mac_pool[16]; - uint8_t naddrfree; /* number of alt mac addr available */ -} hxge_mmac_t; - -/* - * mmac stats structure - */ -typedef struct _hxge_mmac_stats_t { - uint8_t mmac_max_cnt; - uint8_t mmac_avail_cnt; - struct ether_addr mmac_avail_pool[16]; -} hxge_mmac_stats_t, *p_hxge_mmac_stats_t; - #include <hxge_common_impl.h> #include <hxge_common.h> #include <hxge_rxdma.h> diff --git a/usr/src/uts/common/io/hxge/hxge_kstats.c b/usr/src/uts/common/io/hxge/hxge_kstats.c index 9e3a86e953..1629c7c828 100644 --- a/usr/src/uts/common/io/hxge/hxge_kstats.c +++ b/usr/src/uts/common/io/hxge/hxge_kstats.c @@ -261,50 +261,6 @@ hxge_kstat_index_t hxge_pfc_stats[] = { }; typedef enum { - MMAC_MAX_ADDR, - MMAC_AVAIL_ADDR, - MMAC_ADDR_POOL1, - MMAC_ADDR_POOL2, - MMAC_ADDR_POOL3, - MMAC_ADDR_POOL4, - MMAC_ADDR_POOL5, - MMAC_ADDR_POOL6, - MMAC_ADDR_POOL7, - MMAC_ADDR_POOL8, - MMAC_ADDR_POOL9, - MMAC_ADDR_POOL10, - MMAC_ADDR_POOL11, - MMAC_ADDR_POOL12, - MMAC_ADDR_POOL13, - MMAC_ADDR_POOL14, - MMAC_ADDR_POOL15, - MMAC_ADDR_POOL16, - MMAC_STATS_END -} hxge_mmac_stat_index_t; - -hxge_kstat_index_t hxge_mmac_stats[] = { - {MMAC_MAX_ADDR, KSTAT_DATA_UINT64, "max_mmac_addr"}, - {MMAC_AVAIL_ADDR, KSTAT_DATA_UINT64, "avail_mmac_addr"}, - {MMAC_ADDR_POOL1, KSTAT_DATA_UINT64, "mmac_addr_1"}, - {MMAC_ADDR_POOL2, KSTAT_DATA_UINT64, "mmac_addr_2"}, - {MMAC_ADDR_POOL3, KSTAT_DATA_UINT64, "mmac_addr_3"}, - {MMAC_ADDR_POOL4, KSTAT_DATA_UINT64, "mmac_addr_4"}, - {MMAC_ADDR_POOL5, KSTAT_DATA_UINT64, "mmac_addr_5"}, - {MMAC_ADDR_POOL6, KSTAT_DATA_UINT64, "mmac_addr_6"}, - {MMAC_ADDR_POOL7, KSTAT_DATA_UINT64, "mmac_addr_7"}, - {MMAC_ADDR_POOL8, KSTAT_DATA_UINT64, "mmac_addr_8"}, - {MMAC_ADDR_POOL9, KSTAT_DATA_UINT64, "mmac_addr_9"}, - {MMAC_ADDR_POOL10, KSTAT_DATA_UINT64, "mmac_addr_10"}, - {MMAC_ADDR_POOL11, KSTAT_DATA_UINT64, "mmac_addr_11"}, - {MMAC_ADDR_POOL12, KSTAT_DATA_UINT64, "mmac_addr_12"}, - {MMAC_ADDR_POOL13, KSTAT_DATA_UINT64, "mmac_addr_13"}, - {MMAC_ADDR_POOL14, KSTAT_DATA_UINT64, "mmac_addr_14"}, - {MMAC_ADDR_POOL15, KSTAT_DATA_UINT64, "mmac_addr_15"}, - {MMAC_ADDR_POOL16, KSTAT_DATA_UINT64, "mmac_addr_16"}, - {MMAC_STATS_END, NULL, NULL}, -}; - -typedef enum { SPC_ACC_ERR = 0, TDC_PIOACC_ERR, RDC_PIOACC_ERR, @@ -580,75 +536,6 @@ hxge_pfc_stat_update(kstat_t *ksp, int rw) return (0); } -static uint64_t -hxge_mac_octet_to_u64(struct ether_addr addr) -{ - int i; - uint64_t addr64 = 0; - 
- for (i = ETHERADDRL - 1; i >= 0; i--) { - addr64 <<= 8; - addr64 |= addr.ether_addr_octet[i]; - } - return (addr64); -} - -/* ARGSUSED */ -int -hxge_mmac_stat_update(kstat_t *ksp, int rw) -{ - p_hxge_t hxgep; - p_hxge_mmac_kstat_t mmac_kstatsp; - p_hxge_mmac_stats_t statsp; - - hxgep = (p_hxge_t)ksp->ks_private; - if (hxgep == NULL) - return (-1); - - HXGE_DEBUG_MSG((hxgep, KST_CTL, "==> hxge_mmac_stat_update")); - - mmac_kstatsp = (p_hxge_mmac_kstat_t)ksp->ks_data; - statsp = (p_hxge_mmac_stats_t)&hxgep->statsp->mmac_stats; - - mmac_kstatsp->mmac_max_addr_cnt.value.ul = statsp->mmac_max_cnt; - mmac_kstatsp->mmac_avail_addr_cnt.value.ul = statsp->mmac_avail_cnt; - mmac_kstatsp->mmac_addr1.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[0]); - mmac_kstatsp->mmac_addr2.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[1]); - mmac_kstatsp->mmac_addr3.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[2]); - mmac_kstatsp->mmac_addr4.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[3]); - mmac_kstatsp->mmac_addr5.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[4]); - mmac_kstatsp->mmac_addr6.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[5]); - mmac_kstatsp->mmac_addr7.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[6]); - mmac_kstatsp->mmac_addr8.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[7]); - mmac_kstatsp->mmac_addr9.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[8]); - mmac_kstatsp->mmac_addr10.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[9]); - mmac_kstatsp->mmac_addr11.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[10]); - mmac_kstatsp->mmac_addr12.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[11]); - mmac_kstatsp->mmac_addr13.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[12]); - mmac_kstatsp->mmac_addr14.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[13]); - mmac_kstatsp->mmac_addr15.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[14]); - mmac_kstatsp->mmac_addr16.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[15]); - - HXGE_DEBUG_MSG((hxgep, KST_CTL, "<== hxge_mmac_stat_update")); - return (0); -} - /* ARGSUSED */ int hxge_peu_sys_stat_update(kstat_t *ksp, int rw) @@ -722,7 +609,6 @@ hxge_setup_kstats(p_hxge_t hxgep) p_hxge_port_kstat_t hxgekp; size_t hxge_kstat_sz; char stat_name[64]; - char mmac_name[64]; int i; HXGE_DEBUG_MSG((hxgep, KST_CTL, "==> hxge_setup_kstats")); @@ -779,14 +665,6 @@ hxge_setup_kstats(p_hxge_t hxgep) if (hxgep->statsp->vmac_ksp == NULL) cmn_err(CE_WARN, "kstat_create failed for vmac"); - /* Setup MMAC statistics */ - (void) sprintf(mmac_name, "MMAC Stats%d", hxgep->instance); - hxgep->statsp->mmac_ksp = hxge_setup_local_kstat(hxgep, - hxgep->instance, "MMAC", - &hxge_mmac_stats[0], MMAC_STATS_END, hxge_mmac_stat_update); - if (hxgep->statsp->mmac_ksp == NULL) - cmn_err(CE_WARN, "kstat_create failed for mmac"); - /* Setup PEU System statistics */ hxgep->statsp->peu_sys_ksp = hxge_setup_local_kstat(hxgep, hxgep->instance, "PEU", &hxge_peu_sys_stats[0], diff --git a/usr/src/uts/common/io/hxge/hxge_main.c b/usr/src/uts/common/io/hxge/hxge_main.c index b58bf49d8d..47a61060bf 100644 --- a/usr/src/uts/common/io/hxge/hxge_main.c +++ b/usr/src/uts/common/io/hxge/hxge_main.c @@ -151,13 +151,8 @@ static int hxge_m_unicst(void *, const uint8_t *); static int hxge_m_multicst(void *, boolean_t, const uint8_t *); static int hxge_m_promisc(void *, boolean_t); static void hxge_m_ioctl(void 
*, queue_t *, mblk_t *); -static void hxge_m_resources(void *); static hxge_status_t hxge_mac_register(p_hxge_t hxgep); -static int hxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr); -static int hxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); -static int hxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr); -static int hxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr); static boolean_t hxge_m_getcapab(void *, mac_capab_t, void *); static boolean_t hxge_param_locked(mac_prop_id_t pr_num); static int hxge_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num, @@ -196,7 +191,7 @@ mac_priv_prop_t hxge_priv_props[] = { #define MAX_DUMP_SZ 256 #define HXGE_M_CALLBACK_FLAGS \ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) extern mblk_t *hxge_m_tx(void *arg, mblk_t *mp); extern hxge_status_t hxge_pfc_set_default_mac_addr(p_hxge_t hxgep); @@ -210,7 +205,6 @@ static mac_callbacks_t hxge_m_callbacks = { hxge_m_multicst, hxge_m_unicst, hxge_m_tx, - hxge_m_resources, hxge_m_ioctl, hxge_m_getcapab, NULL, @@ -2697,386 +2691,17 @@ hxge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) HXGE_DEBUG_MSG((hxgep, NEMO_CTL, "<== hxge_m_ioctl")); } -extern void hxge_rx_hw_blank(void *arg, time_t ticks, uint_t count); - -static void -hxge_m_resources(void *arg) -{ - p_hxge_t hxgep = arg; - mac_rx_fifo_t mrf; - p_rx_rcr_rings_t rcr_rings; - p_rx_rcr_ring_t *rcr_p; - p_rx_rcr_ring_t rcrp; - uint32_t i, ndmas; - int status; - - HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_m_resources")); - - MUTEX_ENTER(hxgep->genlock); - - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_m_resources: " - "hxge_init failed")); - MUTEX_EXIT(hxgep->genlock); - return; - } - } - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = hxge_rx_hw_blank; - mrf.mrf_arg = (void *)hxgep; - - mrf.mrf_normal_blank_time = RXDMA_RCR_TO_DEFAULT; - mrf.mrf_normal_pkt_count = RXDMA_RCR_PTHRES_DEFAULT; - - rcr_rings = hxgep->rx_rcr_rings; - rcr_p = rcr_rings->rcr_rings; - ndmas = rcr_rings->ndmas; - - /* - * Export our receive resources to the MAC layer. - */ - for (i = 0; i < ndmas; i++) { - rcrp = (void *)(p_rx_rcr_ring_t)rcr_p[i]; - rcrp->rcr_mac_handle = - mac_resource_add(hxgep->mach, (mac_resource_t *)&mrf); - - HXGE_DEBUG_MSG((hxgep, RX_CTL, - "==> hxge_m_resources: vdma %d dma %d " - "rcrptr 0x%016llx mac_handle 0x%016llx", - i, rcrp->rdc, rcr_p[i], rcrp->rcr_mac_handle)); - } - - MUTEX_EXIT(hxgep->genlock); - - HXGE_DEBUG_MSG((hxgep, RX_CTL, "<== hxge_m_resources")); -} - -/* - * Set an alternate MAC address - */ -static int -hxge_altmac_set(p_hxge_t hxgep, uint8_t *maddr, mac_addr_slot_t slot) -{ - uint64_t address; - uint64_t tmp; - hpi_status_t status; - uint8_t addrn; - int i; - - /* - * Convert a byte array to a 48 bit value. 
- * Need to check endianess if in doubt - */ - address = 0; - for (i = 0; i < ETHERADDRL; i++) { - tmp = maddr[i]; - address <<= 8; - address |= tmp; - } - - addrn = (uint8_t)slot; - status = hpi_pfc_set_mac_address(hxgep->hpi_handle, addrn, address); - if (status != HPI_SUCCESS) - return (EIO); - - return (0); -} - -static void -hxge_mmac_kstat_update(p_hxge_t hxgep, mac_addr_slot_t slot) -{ - p_hxge_mmac_stats_t mmac_stats; - int i; - hxge_mmac_t *mmac_info; - - mmac_info = &hxgep->hxge_mmac_info; - mmac_stats = &hxgep->statsp->mmac_stats; - mmac_stats->mmac_max_cnt = mmac_info->num_mmac; - mmac_stats->mmac_avail_cnt = mmac_info->naddrfree; - - for (i = 0; i < ETHERADDRL; i++) { - mmac_stats->mmac_avail_pool[slot].ether_addr_octet[i] = - mmac_info->mac_pool[slot].addr[(ETHERADDRL - 1) - i]; - } -} - -/* - * Find an unused address slot, set the address value to the one specified, - * enable the port to start filtering on the new MAC address. - * Returns: 0 on success. - */ -int -hxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) -{ - p_hxge_t hxgep = arg; - mac_addr_slot_t slot; - hxge_mmac_t *mmac_info; - int err; - hxge_status_t status; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (mmac_info->naddrfree == 0) { - mutex_exit(hxgep->genlock); - return (ENOSPC); - } - - if (!mac_unicst_verify(hxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - /* - * Search for the first available slot. Because naddrfree - * is not zero, we are guaranteed to find one. - * Slot 0 is for unique (primary) MAC. The first alternate - * MAC slot is slot 1. - */ - for (slot = 1; slot < mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - - ASSERT(slot < mmac_info->num_mmac); - if ((err = hxge_altmac_set(hxgep, maddr->mma_addr, slot)) != 0) { - mutex_exit(hxgep->genlock); - return (err); - } - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, ETHERADDRL); - mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED; - mmac_info->naddrfree--; - hxge_mmac_kstat_update(hxgep, slot); - - maddr->mma_slot = slot; - - mutex_exit(hxgep->genlock); - return (0); -} - -/* - * Remove the specified mac address and update - * the h/w not to filter the mac address anymore. - * Returns: 0, on success. - */ -int -hxge_m_mmac_remove(void *arg, mac_addr_slot_t slot) -{ - p_hxge_t hxgep = arg; - hxge_mmac_t *mmac_info; - int err = 0; - hxge_status_t status; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (slot <= 0 || slot >= mmac_info->num_mmac) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - if (hpi_pfc_mac_addr_disable(hxgep->hpi_handle, slot) == - HPI_SUCCESS) { - mmac_info->mac_pool[slot].flags &= ~MMAC_SLOT_USED; - mmac_info->naddrfree++; - /* - * Clear mac_pool[slot].addr so that kstat shows 0 - * alternate MAC address if the slot is not used. 
- */ - bzero(mmac_info->mac_pool[slot].addr, ETHERADDRL); - hxge_mmac_kstat_update(hxgep, slot); - } else { - err = EIO; - } - } else { - err = EINVAL; - } - - mutex_exit(hxgep->genlock); - return (err); -} - -/* - * Modify a mac address added by hxge_mmac_add(). - * Returns: 0, on success. - */ -int -hxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr) -{ - p_hxge_t hxgep = arg; - mac_addr_slot_t slot; - hxge_mmac_t *mmac_info; - int err = 0; - hxge_status_t status; - - if (!mac_unicst_verify(hxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) - return (EINVAL); - - slot = maddr->mma_slot; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (slot <= 0 || slot >= mmac_info->num_mmac) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - if ((err = hxge_altmac_set(hxgep, maddr->mma_addr, - slot)) == 0) { - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, - ETHERADDRL); - hxge_mmac_kstat_update(hxgep, slot); - } - } else { - err = EINVAL; - } - - mutex_exit(hxgep->genlock); - return (err); -} - -/* - * static int - * hxge_m_mmac_get() - Get the MAC address and other information - * related to the slot. mma_flags should be set to 0 in the call. - * Note: although kstat shows MAC address as zero when a slot is - * not used, Crossbow expects hxge_m_mmac_get to copy factory MAC - * to the caller as long as the slot is not using a user MAC address. - * The following table shows the rules, - * - * USED VENDOR mma_addr - * ------------------------------------------------------------ - * (1) Slot uses a user MAC: yes no user MAC - * (2) Slot uses a factory MAC: yes yes factory MAC - * (3) Slot is not used but is - * factory MAC capable: no yes factory MAC - * (4) Slot is not used and is - * not factory MAC capable: no no 0 - * ------------------------------------------------------------ - */ -int -hxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr) -{ - hxge_t *hxgep = arg; - mac_addr_slot_t slot; - hxge_mmac_t *mmac_info; - hxge_status_t status; - - slot = maddr->mma_slot; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (slot <= 0 || slot >= mmac_info->num_mmac) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - maddr->mma_flags = 0; - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - maddr->mma_flags |= MMAC_SLOT_USED; - bcopy(mmac_info->mac_pool[slot].addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = ETHERADDRL; - } - - mutex_exit(hxgep->genlock); - return (0); -} - /*ARGSUSED*/ boolean_t hxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - p_hxge_t hxgep = (p_hxge_t)arg; uint32_t *txflags = cap_data; - multiaddress_capab_t *mmacp = cap_data; switch (cap) { case MAC_CAPAB_HCKSUM: *txflags = HCKSUM_INET_PARTIAL; break; - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning B_TRUE - * stating that we support polling is sufficient. 
- */ - break; - - case MAC_CAPAB_MULTIADDRESS: - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. - */ - mmacp->maddr_naddr = PFC_N_MAC_ADDRESSES - 1; - mmacp->maddr_naddrfree = hxgep->hxge_mmac_info.naddrfree; - mmacp->maddr_flag = 0; /* No multiple factory macs */ - mmacp->maddr_handle = hxgep; - mmacp->maddr_add = hxge_m_mmac_add; - mmacp->maddr_remove = hxge_m_mmac_remove; - mmacp->maddr_modify = hxge_m_mmac_modify; - mmacp->maddr_get = hxge_m_mmac_get; - mmacp->maddr_reserve = NULL; /* No multiple factory macs */ - break; default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/hxge/hxge_rxdma.c b/usr/src/uts/common/io/hxge/hxge_rxdma.c index 0c3747f6bd..2de507a8e9 100644 --- a/usr/src/uts/common/io/hxge/hxge_rxdma.c +++ b/usr/src/uts/common/io/hxge/hxge_rxdma.c @@ -1228,10 +1228,8 @@ hxge_rx_pkts_vring(p_hxge_t hxgep, uint_t vindex, p_hxge_ldv_t ldvp, #ifdef HXGE_DEBUG HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_rx_pkts_vring:calling mac_rx (NEMO) " - "LEN %d mp $%p mp->b_next $%p rcrp $%p " - "mac_handle $%p", - (mp->b_wptr - mp->b_rptr), mp, mp->b_next, - rcrp, rcrp->rcr_mac_handle)); + "LEN %d mp $%p mp->b_next $%p rcrp $%p", + (mp->b_wptr - mp->b_rptr), mp, mp->b_next, rcrp)); HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_rx_pkts_vring: dump packets " "(mp $%p b_rptr $%p b_wptr $%p):\n %s", @@ -1257,7 +1255,7 @@ hxge_rx_pkts_vring(p_hxge_t hxgep, uint_t vindex, p_hxge_ldv_t ldvp, HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_rx_pkts_vring: send packet to stack")); - mac_rx(hxgep->mach, rcrp->rcr_mac_handle, mp); + mac_rx(hxgep->mach, NULL, mp); HXGE_DEBUG_MSG((hxgep, RX_CTL, "<== hxge_rx_pkts_vring")); } diff --git a/usr/src/uts/common/io/hxge/hxge_rxdma.h b/usr/src/uts/common/io/hxge/hxge_rxdma.h index c5277ca590..0d1808a67c 100644 --- a/usr/src/uts/common/io/hxge/hxge_rxdma.h +++ b/usr/src/uts/common/io/hxge/hxge_rxdma.h @@ -344,7 +344,6 @@ typedef struct _rx_rcr_ring_t { uint32_t intr_timeout; uint32_t intr_threshold; uint64_t max_receive_pkts; - mac_resource_handle_t rcr_mac_handle; uint32_t rcvd_pkt_bytes; /* Received bytes of a packet */ } rx_rcr_ring_t, *p_rx_rcr_ring_t; diff --git a/usr/src/uts/common/io/hxge/hxge_virtual.c b/usr/src/uts/common/io/hxge/hxge_virtual.c index b1eff782aa..bbc65993d0 100644 --- a/usr/src/uts/common/io/hxge/hxge_virtual.c +++ b/usr/src/uts/common/io/hxge/hxge_virtual.c @@ -36,7 +36,6 @@ static void hxge_set_hw_dma_config(p_hxge_t); static void hxge_set_hw_class_config(p_hxge_t); static void hxge_ldgv_setup(p_hxge_ldg_t *ldgp, p_hxge_ldv_t *ldvp, uint8_t ldv, uint8_t endldg, int *ngrps); -static hxge_status_t hxge_mmac_init(p_hxge_t); extern uint16_t hxge_rcr_timeout; extern uint16_t hxge_rcr_threshold; @@ -894,35 +893,11 @@ hxge_intr_mask_mgmt_set(p_hxge_t hxgep, boolean_t on) static hxge_status_t hxge_get_mac_addr_properties(p_hxge_t hxgep) { - uint32_t num_macs; - hxge_status_t status; - HXGE_DEBUG_MSG((hxgep, DDI_CTL, "==> hxge_get_mac_addr_properties ")); (void) hxge_pfc_mac_addrs_get(hxgep); hxgep->ouraddr = hxgep->factaddr; - /* - * Get the number of MAC addresses the Hydra supports per blade. - */ - if (hxge_pfc_num_macs_get(hxgep, &num_macs) == HXGE_OK) { - hxgep->hxge_mmac_info.num_mmac = (uint8_t)num_macs; - } else { - HXGE_ERROR_MSG((NULL, HXGE_ERR_CTL, - "hxge_get_mac_addr_properties: get macs failed")); - return (HXGE_ERROR); - } - - /* - * Initialize alt. mac addr. 
in the mac pool - */ - status = hxge_mmac_init(hxgep); - if (status != HXGE_OK) { - HXGE_ERROR_MSG((NULL, HXGE_ERR_CTL, - "hxge_get_mac_addr_properties: init mmac failed")); - return (HXGE_ERROR); - } - HXGE_DEBUG_MSG((hxgep, DDI_CTL, "<== hxge_get_mac_addr_properties ")); return (HXGE_OK); } @@ -971,49 +946,3 @@ hxge_ldgv_setup(p_hxge_ldg_t *ldgp, p_hxge_ldv_t *ldvp, uint8_t ldv, HXGE_DEBUG_MSG((NULL, INT_CTL, "<== hxge_ldgv_setup")); } - -/* - * Note: This function assumes the following distribution of mac - * addresses for a hydra blade: - * - * ------------- - * 0| |0 - local-mac-address for blade - * ------------- - * | |1 - Start of alt. mac addr. for blade - * | | - * | | - * | |15 - * -------------- - */ - -static hxge_status_t -hxge_mmac_init(p_hxge_t hxgep) -{ - int slot; - hxge_mmac_t *mmac_info; - - mmac_info = (hxge_mmac_t *)&hxgep->hxge_mmac_info; - - /* Set flags for unique MAC */ - mmac_info->mac_pool[0].flags |= MMAC_SLOT_USED | MMAC_VENDOR_ADDR; - mmac_info->num_factory_mmac = 1; - - /* - * Skip the factory/default address which is in slot 0. - * Initialze all other mac addr. to "AVAILABLE" state. - * Clear flags of all alternate MAC slots. - */ - for (slot = 1; slot < mmac_info->num_mmac; slot++) { - (void) hpi_pfc_clear_mac_address(hxgep->hpi_handle, slot); - mmac_info->mac_pool[slot].flags = 0; - } - - /* Exclude the factory mac address */ - mmac_info->naddrfree = mmac_info->num_mmac - 1; - - /* Initialize the first two parameters for mmac kstat */ - hxgep->statsp->mmac_stats.mmac_max_cnt = mmac_info->num_mmac; - hxgep->statsp->mmac_stats.mmac_avail_cnt = mmac_info->naddrfree; - - return (HXGE_OK); -} diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd.c b/usr/src/uts/common/io/ib/clients/ibd/ibd.c index 099e2036c8..7992e1007b 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c @@ -37,6 +37,7 @@ #include <sys/strsun.h> #include <sys/strsubr.h> #include <sys/dlpi.h> +#include <sys/mac_provider.h> #include <sys/pattr.h> /* for HCK_PARTIALCKSUM */ #include <sys/sysmacros.h> /* for offsetof */ @@ -310,7 +311,6 @@ static mac_callbacks_t ib_m_callbacks = { ibd_m_unicst, ibd_m_tx, NULL, - NULL, ibd_m_getcapab }; @@ -4102,13 +4102,6 @@ ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } - case MAC_CAPAB_POLL: - /* - * Fallthrough to default, as we don't support GLDv3 - * polling. When blanking is implemented, we will need to - * change this to return B_TRUE in addition to registering - * an mc_resources callback. - */ default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/igb/igb.conf b/usr/src/uts/common/io/igb/igb.conf index c2ae8d4cd3..93860209f0 100644 --- a/usr/src/uts/common/io/igb/igb.conf +++ b/usr/src/uts/common/io/igb/igb.conf @@ -1,19 +1,17 @@ # # CDDL HEADER START # -# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # -# You can obtain a copy of the license at: -# http://www.opensolaris.org/os/licensing. +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # -# When using or redistributing this file, you may do so under the -# License only. No other modification of this header is permitted. 
-# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] @@ -21,11 +19,11 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms of the CDDL. +# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # # Driver.conf file for Intel 1Gb ethernet driver (igb) @@ -121,29 +119,29 @@ # flow_control = 3; # # -------------------- Transmit/Receive Queues -------------------- -# tx_queue_number -# The number of the transmit queues -# Allowed values: 1 - 4 -# Default value: 1 # # tx_ring_size # The number of the transmit descriptors per transmit queue # Allowed values: 64 - 4096 # Default value: 512 # -# rx_queue_number -# The number of the receive queues -# Allowed values: 1 - 4 -# Default value: 1 -# # rx_ring_size # The number of the receive descriptors per receive queue # Allowed values: 64 - 4096 # Default value: 512 # +# mr_enable +# Enable multiple rx queues and tx queues +# Allowed values: 0, 1 +# Default value: 1 +# +# rx_group_number +# The number of the receive ring groups +# Allowed values: 1 - 4 +# Default value: 1 +# +# Note: If the specified value of rx_group_number is not supported by the +# hardware, rx_group_number will be downgraded to an acceptable value. # # -------- How to set parameters for a particular interface --------- # The example below shows how to locate the device path and set a parameter diff --git a/usr/src/uts/common/io/igb/igb_gld.c b/usr/src/uts/common/io/igb/igb_gld.c index d897a484e3..c1213647ec 100644 --- a/usr/src/uts/common/io/igb/igb_gld.c +++ b/usr/src/uts/common/io/igb/igb_gld.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "igb_sw.h" @@ -555,37 +555,6 @@ igb_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) } /* - * Set a new device unicast address. - */ -int -igb_m_unicst(void *arg, const uint8_t *mac_addr) -{ - igb_t *igb = (igb_t *)arg; - int result; - - mutex_enter(&igb->gen_lock); - - if (igb->igb_state & IGB_SUSPENDED) { - mutex_exit(&igb->gen_lock); - return (ECANCELED); - } - - /* - * Store the new MAC address. - */ - bcopy(mac_addr, igb->hw.mac.addr, ETHERADDRL); - - /* - * Set MAC address in address slot 0, which is the default address. - */ - result = igb_unicst_set(igb, mac_addr, 0); - - mutex_exit(&igb->gen_lock); - - return (result); -} - -/* * Pass on M_IOCTL messages passed to the DLD, and support * private IOCTLs for debugging and ndd. */ @@ -654,18 +623,16 @@ igb_m_ioctl(void *arg, queue_t *q, mblk_t *mp) } } - /* - * Find an unused address slot, set the address to it, reserve - * this slot and enable the device to start filtering on the - * new address. + * Add a MAC address to the target RX group. */ -int -igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr) +static int +igb_addmac(void *arg, const uint8_t *mac_addr) { - igb_t *igb = (igb_t *)arg; - mac_addr_slot_t slot; - int err; + igb_rx_group_t *rx_group = (igb_rx_group_t *)arg; + igb_t *igb = rx_group->igb; + struct e1000_hw *hw = &igb->hw; + int i, slot; mutex_enter(&igb->gen_lock); @@ -674,12 +641,6 @@ igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr) return (ECANCELED); } - if (mac_unicst_verify(igb->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&igb->gen_lock); - return (EINVAL); - } - if (igb->unicst_avail == 0) { /* no slots available */ mutex_exit(&igb->gen_lock); @@ -687,39 +648,55 @@ igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr) } /* - * Primary/default address is in slot 0. The next addresses - * are the multiple MAC addresses. So multiple MAC address 0 - * is in slot 1, 1 in slot 2, and so on. So the first multiple - * MAC address resides in slot 1. + * Slots 0 through igb->num_rx_groups - 1 are reserved and are + * mapped 1:1 to the group indices. The remaining slots are + * shared among all of the groups. When adding a MAC address, + * the group's reserved slot is tried first, then the shared slots. */ - for (slot = 1; slot < igb->unicst_total; slot++) { - if (igb->unicst_addr[slot].mac.set == 0) - break; - } + slot = -1; + if (igb->unicst_addr[rx_group->index].mac.set == 1) { + /* + * The reserved slot for the current group is in use; look + * for a free slot among the shared slots. + */ + for (i = igb->num_rx_groups; i < igb->unicst_total; i++) { + if (igb->unicst_addr[i].mac.set == 0) { + slot = i; + break; + } + } + } else + slot = rx_group->index; - ASSERT((slot > 0) && (slot < igb->unicst_total)); + if (slot == -1) { + /* no slots available in the shared slots */ + mutex_exit(&igb->gen_lock); + return (ENOSPC); + } - maddr->mma_slot = slot; + /* Set VMDq according to the mode supported by hardware.
*/ + e1000_rar_set_vmdq(hw, mac_addr, slot, igb->vmdq_mode, rx_group->index); - if ((err = igb_unicst_set(igb, maddr->mma_addr, slot)) == 0) { - igb->unicst_addr[slot].mac.set = 1; - igb->unicst_avail--; - } + bcopy(mac_addr, igb->unicst_addr[slot].mac.addr, ETHERADDRL); + igb->unicst_addr[slot].mac.group_index = rx_group->index; + igb->unicst_addr[slot].mac.set = 1; + igb->unicst_avail--; mutex_exit(&igb->gen_lock); - return (err); + return (0); } - /* - * Removes a MAC address that was added before. + * Remove a MAC address from the specified RX group. */ -int -igb_m_unicst_remove(void *arg, mac_addr_slot_t slot) +static int +igb_remmac(void *arg, const uint8_t *mac_addr) { - igb_t *igb = (igb_t *)arg; - int err; + igb_rx_group_t *rx_group = (igb_rx_group_t *)arg; + igb_t *igb = rx_group->igb; + struct e1000_hw *hw = &igb->hw; + int slot; mutex_enter(&igb->gen_lock); @@ -728,7 +705,8 @@ igb_m_unicst_remove(void *arg, mac_addr_slot_t slot) return (ECANCELED); } - if ((slot <= 0) || (slot >= igb->unicst_total)) { + slot = igb_unicst_find(igb, mac_addr); + if (slot == -1) { mutex_exit(&igb->gen_lock); return (EINVAL); } @@ -738,104 +716,189 @@ igb_m_unicst_remove(void *arg, mac_addr_slot_t slot) return (EINVAL); } - /* Copy the default address to the passed slot */ - if ((err = igb_unicst_set(igb, - igb->unicst_addr[0].mac.addr, slot)) == 0) { - igb->unicst_addr[slot].mac.set = 0; - igb->unicst_avail++; - } + /* Clear the MAC address in the slot */ + e1000_rar_clear(hw, slot); + igb->unicst_addr[slot].mac.set = 0; + igb->unicst_avail++; mutex_exit(&igb->gen_lock); - return (err); + return (0); } /* - * Modifies the value of an address that has been added before. - * The new address length and the slot number that was returned - * in the call to add should be passed in. mma_flags should be - * set to 0. - * Returns 0 on success. + * Enable interrupt on the specified rx ring. */ int -igb_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) +igb_rx_ring_intr_enable(mac_intr_handle_t intrh) { - igb_t *igb = (igb_t *)arg; - mac_addr_slot_t slot; - int err; - - mutex_enter(&igb->gen_lock); + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)intrh; + igb_t *igb = rx_ring->igb; + struct e1000_hw *hw = &igb->hw; + uint32_t index = rx_ring->index; - if (igb->igb_state & IGB_SUSPENDED) { - mutex_exit(&igb->gen_lock); - return (ECANCELED); + if (igb->intr_type == DDI_INTR_TYPE_MSIX) { + /* Interrupt enabling for MSI-X */ + igb->eims_mask |= (E1000_EICR_RX_QUEUE0 << index); + E1000_WRITE_REG(hw, E1000_EIMS, igb->eims_mask); + E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask); + } else { + ASSERT(index == 0); + /* Interrupt enabling for MSI and legacy */ + igb->ims_mask |= E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMS, igb->ims_mask); } - if (mac_unicst_verify(igb->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&igb->gen_lock); - return (EINVAL); - } + E1000_WRITE_FLUSH(hw); - slot = maddr->mma_slot; + return (0); +} - if ((slot <= 0) || (slot >= igb->unicst_total)) { - mutex_exit(&igb->gen_lock); - return (EINVAL); +/* + * Disable interrupt on the specified rx ring.
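igb_addmac() above is a two-tier allocator: a group first claims the RAR slot equal to its own index, and only then falls back to the shared slots at num_rx_groups and beyond. A runnable sketch of that selection, using an arbitrary geometry of 2 groups and 6 total slots in place of the driver's unicst bookkeeping:

#include <stdio.h>

#define NUM_RX_GROUPS	2	/* arbitrary example geometry */
#define UNICST_TOTAL	6	/* total RAR slots */

static int slot_set[UNICST_TOTAL];	/* mirrors unicst_addr[i].mac.set */

/*
 * Sketch of igb_addmac()'s slot choice: try the group's reserved slot
 * (slot == group index), then fall back to the shared slots in
 * [NUM_RX_GROUPS, UNICST_TOTAL).  Returns -1 when everything is full.
 */
static int
pick_slot(int group)
{
	int i;

	if (slot_set[group] == 0)
		return (group);		/* reserved slot still free */
	for (i = NUM_RX_GROUPS; i < UNICST_TOTAL; i++) {
		if (slot_set[i] == 0)
			return (i);
	}
	return (-1);
}

int
main(void)
{
	int g, s;

	/* Add three addresses to group 1: reserved slot first, then shared. */
	for (g = 0; g < 3; g++) {
		s = pick_slot(1);
		if (s >= 0)
			slot_set[s] = 1;
		(void) printf("add to group 1 -> slot %d\n", s);
	}
	return (0);
}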
*/ +int +igb_rx_ring_intr_disable(mac_intr_handle_t intrh) +{ + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)intrh; + igb_t *igb = rx_ring->igb; + struct e1000_hw *hw = &igb->hw; + uint32_t index = rx_ring->index; + + if (igb->intr_type == DDI_INTR_TYPE_MSIX) { + /* Interrupt disabling for MSI-X */ + igb->eims_mask &= ~(E1000_EICR_RX_QUEUE0 << index); + E1000_WRITE_REG(hw, E1000_EIMC, + (E1000_EICR_RX_QUEUE0 << index)); + E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask); + } else { + ASSERT(index == 0); + /* Interrupt disabling for MSI and legacy */ + igb->ims_mask &= ~E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0); } - if (igb->unicst_addr[slot].mac.set == 0) { - mutex_exit(&igb->gen_lock); - return (EINVAL); + E1000_WRITE_FLUSH(hw); + + return (0); +} + +/* + * Get the global ring index from a ring index within a group. + */ +int +igb_get_rx_ring_index(igb_t *igb, int gindex, int rindex) +{ + igb_rx_ring_t *rx_ring; + int i; + + for (i = 0; i < igb->num_rx_rings; i++) { + rx_ring = &igb->rx_rings[i]; + if (rx_ring->group_index == gindex) + rindex--; + if (rindex < 0) + return (i); } - err = igb_unicst_set(igb, maddr->mma_addr, slot); + return (-1); +} - mutex_exit(&igb->gen_lock); +static int +igb_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +{ + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)rh; - return (err); + mutex_enter(&rx_ring->rx_lock); + rx_ring->ring_gen_num = mr_gen_num; + mutex_exit(&rx_ring->rx_lock); + return (0); } /* - * Get the MAC address and all other information related to - * the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. - * On return, mma_flags can take the following values: - * 1) MMAC_SLOT_UNUSED - * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR - * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR - * 4) MMAC_SLOT_USED + * Callback function for the MAC layer to register all rings. */ -int -igb_m_unicst_get(void *arg, mac_multi_addr_t *maddr) +/* ARGSUSED */ +void +igb_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) { igb_t *igb = (igb_t *)arg; - mac_addr_slot_t slot; + mac_intr_t *mintr = &infop->mri_intr; - mutex_enter(&igb->gen_lock); + switch (rtype) { + case MAC_RING_TYPE_RX: { + igb_rx_ring_t *rx_ring; + int global_index; - if (igb->igb_state & IGB_SUSPENDED) { - mutex_exit(&igb->gen_lock); - return (ECANCELED); - } + /* + * 'index' is the ring index within the group. + * We need the global ring index, found by searching in the group.
+ */ + global_index = igb_get_rx_ring_index(igb, rg_index, index); - slot = maddr->mma_slot; + ASSERT(global_index >= 0); - if ((slot <= 0) || (slot >= igb->unicst_total)) { - mutex_exit(&igb->gen_lock); - return (EINVAL); + rx_ring = &igb->rx_rings[global_index]; + rx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = igb_ring_start; + infop->mri_stop = NULL; + infop->mri_poll = (mac_ring_poll_t)igb_rx_ring_poll; + + mintr->mi_handle = (mac_intr_handle_t)rx_ring; + mintr->mi_enable = igb_rx_ring_intr_enable; + mintr->mi_disable = igb_rx_ring_intr_disable; + + break; } + case MAC_RING_TYPE_TX: { + ASSERT(index < igb->num_tx_rings); - if (igb->unicst_addr[slot].mac.set == 1) { - bcopy(igb->unicst_addr[slot].mac.addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_flags = MMAC_SLOT_USED; - } else { - maddr->mma_flags = MMAC_SLOT_UNUSED; + igb_tx_ring_t *tx_ring = &igb->tx_rings[index]; + tx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)tx_ring; + infop->mri_start = NULL; + infop->mri_stop = NULL; + infop->mri_tx = igb_tx_ring_send; + + break; } - mutex_exit(&igb->gen_lock); + default: + break; + } +} - return (0); +void +igb_fill_group(void *arg, mac_ring_type_t rtype, const int index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + igb_t *igb = (igb_t *)arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + igb_rx_group_t *rx_group; + + ASSERT((index >= 0) && (index < igb->num_rx_groups)); + + rx_group = &igb->rx_groups[index]; + rx_group->group_handle = gh; + + infop->mgi_driver = (mac_group_driver_t)rx_group; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = igb_addmac; + infop->mgi_remmac = igb_remmac; + infop->mgi_count = (igb->num_rx_rings / igb->num_rx_groups); + + break; + } + case MAC_RING_TYPE_TX: + break; + default: + break; + } } /* @@ -863,27 +926,34 @@ igb_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *tx_hcksum_flags = HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM; break; } - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_RX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = igb->num_rx_rings; + cap_rings->mr_gnum = igb->num_rx_groups; + cap_rings->mr_rget = igb_fill_ring; + cap_rings->mr_gget = igb_fill_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. 
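igb_fill_ring() above is handed a (group index, ring-within-group) pair by the MAC layer while the driver keeps a single flat rx_rings[] array, so igb_get_rx_ring_index() counts down rindex across the rings whose group_index matches. A runnable sketch of that mapping, assuming the same i / ring_per_group group assignment that igb_setup_rx() applies:

#include <stdio.h>

#define NUM_RX_RINGS	4	/* arbitrary example geometry */
#define NUM_RX_GROUPS	2

static int group_index[NUM_RX_RINGS];

/*
 * Same walk as igb_get_rx_ring_index(): find the rindex-th ring
 * (0-based) whose group_index equals gindex.
 */
static int
get_rx_ring_index(int gindex, int rindex)
{
	int i;

	for (i = 0; i < NUM_RX_RINGS; i++) {
		if (group_index[i] == gindex)
			rindex--;
		if (rindex < 0)
			return (i);
	}
	return (-1);
}

int
main(void)
{
	int i, ring_per_group = NUM_RX_RINGS / NUM_RX_GROUPS;

	/* Mirror igb_setup_rx(): ring i belongs to group i / ring_per_group. */
	for (i = 0; i < NUM_RX_RINGS; i++)
		group_index[i] = i / ring_per_group;

	(void) printf("group 1, ring 0 -> global %d\n", get_rx_ring_index(1, 0));
	(void) printf("group 1, ring 1 -> global %d\n", get_rx_ring_index(1, 1));
	return (0);
}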
- */ - mmacp->maddr_naddr = igb->unicst_total - 1; - mmacp->maddr_naddrfree = igb->unicst_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = igb; - mmacp->maddr_add = igb_m_unicst_add; - mmacp->maddr_remove = igb_m_unicst_remove; - mmacp->maddr_modify = igb_m_unicst_modify; - mmacp->maddr_get = igb_m_unicst_get; - mmacp->maddr_reserve = NULL; + break; + case MAC_RING_TYPE_TX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = igb->num_tx_rings; + cap_rings->mr_gnum = 0; + cap_rings->mr_rget = igb_fill_ring; + cap_rings->mr_gget = NULL; + + break; + default: + break; + } break; } + default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/igb/igb_hw.h b/usr/src/uts/common/io/igb/igb_hw.h index 814b0c09fb..04c410d7d1 100644 --- a/usr/src/uts/common/io/igb/igb_hw.h +++ b/usr/src/uts/common/io/igb/igb_hw.h @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,8 +20,12 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. 
*/ /* IntelVersion: 1.357 v2007-12-10_dragonlake5 */ @@ -31,8 +33,6 @@ #ifndef _IGB_HW_H #define _IGB_HW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -620,6 +620,9 @@ s32 e1000_read_pcie_cap_reg(struct e1000_hw *hw, u32 reg, u16 *value); void e1000_free_dev_spec_struct(struct e1000_hw *hw); void e1000_read_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value); void e1000_write_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value); +void e1000_rar_clear(struct e1000_hw *hw, uint32_t); +void e1000_rar_set_vmdq(struct e1000_hw *hw, const uint8_t *, uint32_t, + uint32_t, uint8_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/igb/igb_main.c b/usr/src/uts/common/io/igb/igb_main.c index 18a7050e7e..ed475f0014 100644 --- a/usr/src/uts/common/io/igb/igb_main.c +++ b/usr/src/uts/common/io/igb/igb_main.c @@ -60,6 +60,8 @@ static void igb_setup_tx(igb_t *); static void igb_setup_rx_ring(igb_rx_ring_t *); static void igb_setup_tx_ring(igb_tx_ring_t *); static void igb_setup_rss(igb_t *); +static void igb_setup_mac_rss_classify(igb_t *); +static void igb_setup_mac_classify(igb_t *); static void igb_init_unicst(igb_t *); static void igb_setup_multicst(igb_t *); static void igb_get_phy_state(igb_t *); @@ -93,10 +95,11 @@ static void igb_setup_adapter_msix(igb_t *); static uint_t igb_intr_legacy(void *, void *); static uint_t igb_intr_msi(void *, void *); static uint_t igb_intr_rx(void *, void *); +static uint_t igb_intr_tx(void *, void *); static uint_t igb_intr_tx_other(void *, void *); static void igb_intr_rx_work(igb_rx_ring_t *); static void igb_intr_tx_work(igb_tx_ring_t *); -static void igb_intr_other_work(igb_t *); +static void igb_intr_link_work(igb_t *); static void igb_get_driver_control(struct e1000_hw *); static void igb_release_driver_control(struct e1000_hw *); @@ -175,14 +178,12 @@ static mac_callbacks_t igb_m_callbacks = { igb_m_stop, igb_m_promisc, igb_m_multicst, - igb_m_unicst, - igb_m_tx, + NULL, NULL, igb_m_ioctl, igb_m_getcapab }; - /* * Module Initialization Functions */ @@ -339,7 +340,7 @@ igb_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) * interrupts are allocated. 
*/ if (igb_alloc_rings(igb) != IGB_SUCCESS) { - igb_error(igb, "Failed to allocate rx and tx rings"); + igb_error(igb, "Failed to allocate rx/tx rings or groups"); goto attach_fail; } igb->attach_progress |= ATTACH_PROGRESS_ALLOC_RINGS; @@ -378,10 +379,13 @@ igb_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) /* * Initialize chipset hardware */ + mutex_enter(&igb->gen_lock); if (igb_init(igb) != IGB_SUCCESS) { + mutex_exit(&igb->gen_lock); igb_error(igb, "Failed to initialize adapter"); goto attach_fail; } + mutex_exit(&igb->gen_lock); igb->attach_progress |= ATTACH_PROGRESS_INIT; /* @@ -710,6 +714,7 @@ igb_register_mac(igb_t *igb) mac->m_max_sdu = igb->max_frame_size - sizeof (struct ether_vlan_header) - ETHERFCSL; mac->m_margin = VLAN_TAGSZ; + mac->m_v12n = MAC_VIRT_LEVEL1; status = mac_register(mac, &igb->mac_hdl); @@ -1019,7 +1024,7 @@ igb_init(igb_t *igb) uint32_t pba; uint32_t high_water; - mutex_enter(&igb->gen_lock); + ASSERT(mutex_owned(&igb->gen_lock)); /* * Reset chipset to put the hardware in a known state @@ -1121,7 +1126,6 @@ igb_init(igb_t *igb) goto init_fail; } - mutex_exit(&igb->gen_lock); return (IGB_SUCCESS); init_fail: @@ -1131,8 +1135,6 @@ init_fail: if (e1000_check_reset_block(hw) == E1000_SUCCESS) (void) e1000_phy_hw_reset(hw); - mutex_exit(&igb->gen_lock); - ddi_fm_service_impact(igb->dip, DDI_SERVICE_LOST); return (IGB_FAILURE); @@ -1541,9 +1543,12 @@ igb_start(igb_t *igb) /* * Start the chipset hardware */ - if (igb_chip_start(igb) != IGB_SUCCESS) { - igb_fm_ereport(igb, DDI_FM_DEVICE_INVAL_STATE); - goto start_failure; + if (!(igb->attach_progress & ATTACH_PROGRESS_INIT)) { + if (igb_init(igb) != IGB_SUCCESS) { + igb_fm_ereport(igb, DDI_FM_DEVICE_INVAL_STATE); + goto start_failure; + } + igb->attach_progress |= ATTACH_PROGRESS_INIT; } /* @@ -1591,6 +1596,8 @@ igb_stop(igb_t *igb) ASSERT(mutex_owned(&igb->gen_lock)); + igb->attach_progress &= ~ ATTACH_PROGRESS_INIT; + /* * Disable the adapter interrupts */ @@ -1656,6 +1663,23 @@ igb_alloc_rings(igb_t *igb) return (IGB_FAILURE); } + /* + * Allocate memory space for rx ring groups + */ + igb->rx_groups = kmem_zalloc( + sizeof (igb_rx_group_t) * igb->num_rx_groups, + KM_NOSLEEP); + + if (igb->rx_groups == NULL) { + kmem_free(igb->rx_rings, + sizeof (igb_rx_ring_t) * igb->num_rx_rings); + kmem_free(igb->tx_rings, + sizeof (igb_tx_ring_t) * igb->num_tx_rings); + igb->rx_rings = NULL; + igb->tx_rings = NULL; + return (IGB_FAILURE); + } + return (IGB_SUCCESS); } @@ -1676,6 +1700,12 @@ igb_free_rings(igb_t *igb) sizeof (igb_tx_ring_t) * igb->num_tx_rings); igb->tx_rings = NULL; } + + if (igb->rx_groups != NULL) { + kmem_free(igb->rx_groups, + sizeof (igb_rx_group_t) * igb->num_rx_groups); + igb->rx_groups = NULL; + } } /* @@ -1782,8 +1812,10 @@ static void igb_setup_rx(igb_t *igb) { igb_rx_ring_t *rx_ring; + igb_rx_group_t *rx_group; struct e1000_hw *hw = &igb->hw; uint32_t reg_val; + uint32_t ring_per_group; int i; /* @@ -1804,12 +1836,24 @@ igb_setup_rx(igb_t *igb) E1000_WRITE_REG(hw, E1000_RCTL, reg_val); + for (i = 0; i < igb->num_rx_groups; i++) { + rx_group = &igb->rx_groups[i]; + rx_group->index = i; + rx_group->igb = igb; + } + /* * igb_setup_rx_ring must be called after configuring RCTL */ + ring_per_group = igb->num_rx_rings / igb->num_rx_groups; for (i = 0; i < igb->num_rx_rings; i++) { rx_ring = &igb->rx_rings[i]; igb_setup_rx_ring(rx_ring); + + /* + * Map a ring to a group by assigning a group index + */ + rx_ring->group_index = i / ring_per_group; } /* @@ -1829,10 +1873,32 @@ igb_setup_rx(igb_t *igb) } /* 
- * Setup RSS for multiple receive queues
+ * Setup classification and RSS for multiple receive queues
 */
- if (igb->num_rx_rings > 1)
- igb_setup_rss(igb);
+ switch (igb->vmdq_mode) {
+ case E1000_VMDQ_OFF:
+ /*
+ * One ring group; only RSS is needed when more than
+ * one ring is enabled.
+ */
+ if (igb->num_rx_rings > 1)
+ igb_setup_rss(igb);
+ break;
+ case E1000_VMDQ_MAC:
+ /*
+ * Multiple groups, each group has one ring;
+ * only the MAC classification is needed.
+ */
+ igb_setup_mac_classify(igb);
+ break;
+ case E1000_VMDQ_MAC_RSS:
+ /*
+ * Multiple groups and multiple rings; both
+ * MAC classification and RSS are needed.
+ */
+ igb_setup_mac_rss_classify(igb);
+ break;
+ }
 }

 static void
@@ -1848,6 +1914,7 @@ igb_setup_tx_ring(igb_tx_ring_t *tx_ring)
 ASSERT(mutex_owned(&tx_ring->tx_lock));
 ASSERT(mutex_owned(&igb->gen_lock));
+
 /*
 * Initialize the length register
 */
@@ -1922,6 +1989,14 @@ igb_setup_tx_ring(igb_tx_ring_t *tx_ring)
 }

 /*
+ * Enable the specific tx ring; this is required by multiple tx
+ * ring support.
+ */
+ reg_val = E1000_READ_REG(hw, E1000_TXDCTL(tx_ring->index));
+ reg_val |= E1000_TXDCTL_QUEUE_ENABLE;
+ E1000_WRITE_REG(hw, E1000_TXDCTL(tx_ring->index), reg_val);
+
+ /*
 * Initialize hardware checksum offload settings
 */
 tx_ring->hcksum_context.hcksum_flags = 0;
@@ -2036,6 +2111,117 @@ igb_setup_rss(igb_t *igb)
 }

 /*
+ * igb_setup_mac_rss_classify - Setup MAC classification and RSS
+ */
+static void
+igb_setup_mac_rss_classify(igb_t *igb)
+{
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t i, mrqc, vmdctl, rxcsum;
+ uint32_t ring_per_group;
+ int shift_group0, shift_group1;
+ uint32_t random;
+ union e1000_reta {
+ uint32_t dword;
+ uint8_t bytes[4];
+ } reta;
+
+ ring_per_group = igb->num_rx_rings / igb->num_rx_groups;
+
+ /* Setup the Redirection Table; it is shared between the two groups */
+ shift_group0 = 2;
+ shift_group1 = 6;
+ for (i = 0; i < (32 * 4); i++) {
+ reta.bytes[i & 3] = ((i % ring_per_group) << shift_group0) |
+ ((ring_per_group + (i % ring_per_group)) << shift_group1);
+ if ((i & 3) == 3) {
+ E1000_WRITE_REG(hw,
+ (E1000_RETA(0) + (i & ~3)), reta.dword);
+ }
+ }
+
+ /* Fill out hash function seeds */
+ for (i = 0; i < 10; i++) {
+ (void) random_get_pseudo_bytes((uint8_t *)&random,
+ sizeof (uint32_t));
+ E1000_WRITE_REG(hw, E1000_RSSRK(i), random);
+ }
+
+ /*
+ * Setup the Multiple Receive Queue Control register;
+ * enable VMDq based on packet destination MAC address and RSS.
+ */
+ mrqc = E1000_MRQC_ENABLE_VMDQ_MAC_RSS_GROUP;
+ mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
+ E1000_MRQC_RSS_FIELD_IPV4_TCP |
+ E1000_MRQC_RSS_FIELD_IPV6 |
+ E1000_MRQC_RSS_FIELD_IPV6_TCP |
+ E1000_MRQC_RSS_FIELD_IPV4_UDP |
+ E1000_MRQC_RSS_FIELD_IPV6_UDP |
+ E1000_MRQC_RSS_FIELD_IPV6_UDP_EX |
+ E1000_MRQC_RSS_FIELD_IPV6_TCP_EX);
+
+ E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+
+ /* Define the default group and default queues */
+ vmdctl = E1000_VMDQ_MAC_GROUP_DEFAULT_QUEUE;
+ E1000_WRITE_REG(hw, E1000_VMD_CTL, vmdctl);
+
+ /*
+ * Disable Packet Checksum to enable RSS for multiple receive queues.
+ *
+ * The Packet Checksum is not the Ethernet CRC. It is another kind of
+ * checksum offloading provided by the 82575 chipset besides the IP
+ * header checksum offloading and the TCP/UDP checksum offloading.
+ * The Packet Checksum is by default computed over the entire packet
+ * from the first byte of the DA through the last byte of the CRC,
+ * including the Ethernet and IP headers.
+ *
+ * It is a hardware limitation that Packet Checksum is mutually
+ * exclusive with RSS.
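/*
 * Editor's sketch, not part of this changeset: a standalone model of the
 * redirection-table packing that igb_setup_mac_rss_classify() above
 * performs. The shift_group0/shift_group1 values are taken from the driver
 * code; the two-groups-by-four-rings split is an assumed example. Each
 * RETA byte carries group 0's queue at shift 2 and group 1's queue at
 * shift 6, truncated to eight bits just as the driver's reta.bytes[] is.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint8_t reta[128];
    uint32_t ring_per_group = 4;    /* assumed: 8 rx rings / 2 groups */
    int shift_group0 = 2, shift_group1 = 6;
    uint32_t i;

    for (i = 0; i < 128; i++) {
        reta[i] = (uint8_t)(((i % ring_per_group) << shift_group0) |
            ((ring_per_group + (i % ring_per_group)) << shift_group1));
    }

    /* The first four entries show one full cycle of the pattern. */
    for (i = 0; i < 4; i++)
        (void) printf("reta[%u] = 0x%02x\n", (unsigned)i,
            (unsigned)reta[i]);
    return (0);
}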
+ */
+ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
+}
+
+/*
+ * igb_setup_mac_classify - Setup MAC classification feature
+ */
+static void
+igb_setup_mac_classify(igb_t *igb)
+{
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t mrqc, rxcsum;
+
+ /*
+ * Setup the Multiple Receive Queue Control register;
+ * enable VMDq based on packet destination MAC address.
+ */
+ mrqc = E1000_MRQC_ENABLE_VMDQ_MAC_GROUP;
+ E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+
+ /*
+ * Disable Packet Checksum to enable RSS for multiple receive queues.
+ *
+ * The Packet Checksum is not the Ethernet CRC. It is another kind of
+ * checksum offloading provided by the 82575 chipset besides the IP
+ * header checksum offloading and the TCP/UDP checksum offloading.
+ * The Packet Checksum is by default computed over the entire packet
+ * from the first byte of the DA through the last byte of the CRC,
+ * including the Ethernet and IP headers.
+ *
+ * It is a hardware limitation that Packet Checksum is mutually
+ * exclusive with RSS.
+ */
+ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
+}
+
+/*
 * igb_init_unicst - Initialize the unicast addresses
 */
static void
@@ -2049,41 +2235,39 @@ igb_init_unicst(igb_t *igb)
 *
 * 1. Chipset is initialized the first time
 * Initialize the multiple unicast addresses, and
- * save the default mac address.
+ * save the default MAC address.
 *
 * 2. Chipset is reset
 * Recover the multiple unicast addresses from the
 * software data structure to the RAR registers.
 */
- if (!igb->unicst_init) {
- /* Initialize the multiple unicast addresses */
- igb->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
- igb->unicst_avail = igb->unicst_total - 1;
+ /*
+ * Clear the default MAC address in the RAR0 register,
+ * which is loaded from the EEPROM at system boot or chip reset;
+ * otherwise it conflicts with the add_mac/rem_mac entry
+ * points when VMDq is enabled. For this reason, the RAR0
+ * must be cleared for both cases mentioned above.
+ */ + e1000_rar_clear(hw, 0); - /* Store the default mac address */ - e1000_rar_set(hw, hw->mac.addr, 0); + if (!igb->unicst_init) { - bcopy(hw->mac.addr, igb->unicst_addr[0].mac.addr, - ETHERADDRL); - igb->unicst_addr[0].mac.set = 1; + /* Initialize the multiple unicast addresses */ + igb->unicst_total = MAX_NUM_UNICAST_ADDRESSES; + igb->unicst_avail = igb->unicst_total; - for (slot = 1; slot < igb->unicst_total; slot++) + for (slot = 0; slot < igb->unicst_total; slot++) igb->unicst_addr[slot].mac.set = 0; igb->unicst_init = B_TRUE; } else { - /* Recover the default mac address */ - bcopy(igb->unicst_addr[0].mac.addr, hw->mac.addr, - ETHERADDRL); - - /* Store the default mac address */ - e1000_rar_set(hw, hw->mac.addr, 0); - /* Re-configure the RAR registers */ - for (slot = 1; slot < igb->unicst_total; slot++) - e1000_rar_set(hw, - igb->unicst_addr[slot].mac.addr, slot); + for (slot = 0; slot < igb->unicst_total; slot++) { + e1000_rar_set_vmdq(hw, igb->unicst_addr[slot].mac.addr, + slot, igb->vmdq_mode, + igb->unicst_addr[slot].mac.group_index); + } } if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) @@ -2091,11 +2275,30 @@ igb_init_unicst(igb_t *igb) } /* + * igb_unicst_find - Find the slot for the specified unicast address + */ +int +igb_unicst_find(igb_t *igb, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(&igb->gen_lock)); + + for (slot = 0; slot < igb->unicst_total; slot++) { + if (bcmp(igb->unicst_addr[slot].mac.addr, + mac_addr, ETHERADDRL) == 0) + return (slot); + } + + return (-1); +} + +/* * igb_unicst_set - Set the unicast address to the specified slot */ int igb_unicst_set(igb_t *igb, const uint8_t *mac_addr, - mac_addr_slot_t slot) + int slot) { struct e1000_hw *hw = &igb->hw; @@ -2232,6 +2435,8 @@ igb_get_conf(igb_t *igb) struct e1000_hw *hw = &igb->hw; uint32_t default_mtu; uint32_t flow_control; + uint32_t ring_per_group; + int i; /* * igb driver supports the following user configurations: @@ -2299,16 +2504,66 @@ igb_get_conf(igb_t *igb) /* * Multiple rings configurations */ - igb->num_tx_rings = igb_get_prop(igb, PROP_TX_QUEUE_NUM, - MIN_TX_QUEUE_NUM, MAX_TX_QUEUE_NUM, DEFAULT_TX_QUEUE_NUM); igb->tx_ring_size = igb_get_prop(igb, PROP_TX_RING_SIZE, MIN_TX_RING_SIZE, MAX_TX_RING_SIZE, DEFAULT_TX_RING_SIZE); - - igb->num_rx_rings = igb_get_prop(igb, PROP_RX_QUEUE_NUM, - MIN_RX_QUEUE_NUM, MAX_RX_QUEUE_NUM, DEFAULT_RX_QUEUE_NUM); igb->rx_ring_size = igb_get_prop(igb, PROP_RX_RING_SIZE, MIN_RX_RING_SIZE, MAX_RX_RING_SIZE, DEFAULT_RX_RING_SIZE); + igb->mr_enable = igb_get_prop(igb, PROP_MR_ENABLE, 0, 1, 1); + igb->num_rx_groups = igb_get_prop(igb, PROP_RX_GROUP_NUM, + MIN_RX_GROUP_NUM, MAX_RX_GROUP_NUM, DEFAULT_RX_GROUP_NUM); + + if (igb->mr_enable) { + igb->num_tx_rings = DEFAULT_TX_QUEUE_NUM; + igb->num_rx_rings = DEFAULT_RX_QUEUE_NUM; + } else { + igb->num_tx_rings = 1; + igb->num_rx_rings = 1; + + if (igb->num_rx_groups > 1) { + igb_error(igb, + "Invalid rx groups number. Please enable multiple " + "rings first"); + igb->num_rx_groups = 1; + } + } + + /* + * Check the divisibility between rx rings and rx groups. + */ + for (i = igb->num_rx_groups; i > 0; i--) { + if ((igb->num_rx_rings % i) == 0) + break; + } + if (i != igb->num_rx_groups) { + igb_error(igb, + "Invalid rx groups number. Downgrade the rx group " + "number to %d.", i); + igb->num_rx_groups = i; + } + + /* + * Get the ring number per group. 
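/*
 * Editor's sketch, not driver code: the divisibility check in
 * igb_get_conf() above, restated as a standalone function. It walks down
 * from the configured group count to the largest value that evenly
 * divides the ring count, which is exactly what the driver's downgrade
 * loop does before it logs the adjustment.
 */
#include <stdio.h>

static int
downgrade_groups(int num_rx_rings, int num_rx_groups)
{
    int i;

    for (i = num_rx_groups; i > 0; i--) {
        if ((num_rx_rings % i) == 0)
            break;
    }
    return (i);
}

int
main(void)
{
    /* 4 rings cannot be split into 3 groups; 2 is the downgrade. */
    (void) printf("%d\n", downgrade_groups(4, 3));
    return (0);
}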
+ */ + ring_per_group = igb->num_rx_rings / igb->num_rx_groups; + + if (igb->num_rx_groups == 1) { + /* + * One rx ring group, the rx ring number is num_rx_rings. + */ + igb->vmdq_mode = E1000_VMDQ_OFF; + } else if (ring_per_group == 1) { + /* + * Multiple rx groups, each group has one rx ring. + */ + igb->vmdq_mode = E1000_VMDQ_MAC; + } else { + /* + * Multiple groups and multiple rings. + */ + igb->vmdq_mode = E1000_VMDQ_MAC_RSS; + } + /* * Tunable used to force an interrupt type. The only use is * for testing of the lesser interrupt types. @@ -2861,6 +3116,7 @@ igb_enable_adapter_interrupts(igb_t *igb) /* Interrupt enabling for MSI-X */ E1000_WRITE_REG(hw, E1000_EIMS, igb->eims_mask); E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask); + igb->ims_mask = E1000_IMS_LSC; E1000_WRITE_REG(hw, E1000_IMS, E1000_IMS_LSC); /* Enable MSI-X PBA support */ @@ -2873,6 +3129,7 @@ igb_enable_adapter_interrupts(igb_t *igb) E1000_WRITE_REG(hw, E1000_CTRL_EXT, reg); } else { /* Interrupt enabling for MSI and legacy */ + igb->ims_mask = IMS_ENABLE_MASK; E1000_WRITE_REG(hw, E1000_IMS, IMS_ENABLE_MASK); } @@ -3176,11 +3433,12 @@ igb_intr_rx_work(igb_rx_ring_t *rx_ring) mblk_t *mp; mutex_enter(&rx_ring->rx_lock); - mp = igb_rx(rx_ring); + mp = igb_rx(rx_ring, IGB_NO_POLL); mutex_exit(&rx_ring->rx_lock); if (mp != NULL) - mac_rx(rx_ring->igb->mac_hdl, NULL, mp); + mac_rx_ring(rx_ring->igb->mac_hdl, rx_ring->ring_handle, mp, + rx_ring->ring_gen_num); } #pragma inline(igb_intr_tx_work) @@ -3197,17 +3455,17 @@ igb_intr_tx_work(igb_tx_ring_t *tx_ring) if (tx_ring->reschedule && (tx_ring->tbd_free >= tx_ring->resched_thresh)) { tx_ring->reschedule = B_FALSE; - mac_tx_update(tx_ring->igb->mac_hdl); + mac_tx_ring_update(tx_ring->igb->mac_hdl, tx_ring->ring_handle); IGB_DEBUG_STAT(tx_ring->stat_reschedule); } } -#pragma inline(igb_intr_other_work) +#pragma inline(igb_intr_link_work) /* - * igb_intr_other_work - other processing of ISR + * igb_intr_link_work - link-status-change processing of ISR */ static void -igb_intr_other_work(igb_t *igb) +igb_intr_link_work(igb_t *igb) { boolean_t link_changed; @@ -3273,7 +3531,7 @@ igb_intr_legacy(void *arg1, void *arg2) ASSERT(igb->num_tx_rings == 1); if (icr & E1000_ICR_RXT0) { - mp = igb_rx(&igb->rx_rings[0]); + mp = igb_rx(&igb->rx_rings[0], IGB_NO_POLL); } if (icr & E1000_ICR_TXDW) { @@ -3320,7 +3578,7 @@ igb_intr_legacy(void *arg1, void *arg2) if (tx_reschedule) { tx_ring->reschedule = B_FALSE; - mac_tx_update(igb->mac_hdl); + mac_tx_ring_update(igb->mac_hdl, tx_ring->ring_handle); IGB_DEBUG_STAT(tx_ring->stat_reschedule); } @@ -3359,7 +3617,7 @@ igb_intr_msi(void *arg1, void *arg2) } if (icr & E1000_ICR_LSC) { - igb_intr_other_work(igb); + igb_intr_link_work(igb); } return (DDI_INTR_CLAIMED); @@ -3385,10 +3643,27 @@ igb_intr_rx(void *arg1, void *arg2) } /* + * igb_intr_tx - Interrupt handler for tx + */ +static uint_t +igb_intr_tx(void *arg1, void *arg2) +{ + igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg1; + + _NOTE(ARGUNUSED(arg2)); + + /* + * Only used via MSI-X vector so don't check cause bits + * and only clean the given ring. + */ + igb_intr_tx_work(tx_ring); + + return (DDI_INTR_CLAIMED); +} + +/* * igb_intr_tx_other - Interrupt handler for both tx and other * - * Always look for Tx cleanup work. Only look for other work if the right - * bits are set in the Interrupt Cause Register. 
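/*
 * Editor's sketch: the ring/group to VMDq mode mapping chosen in
 * igb_get_conf() above, restated as a pure function. The E1000_VMDQ_*
 * values match the definitions this changeset adds to igb_osdep.h.
 */
#include <stdio.h>

#define E1000_VMDQ_OFF      0
#define E1000_VMDQ_MAC      1
#define E1000_VMDQ_MAC_RSS  2

static int
pick_vmdq_mode(int rings, int groups)
{
    int ring_per_group = rings / groups;

    if (groups == 1)
        return (E1000_VMDQ_OFF);    /* one group: RSS only */
    if (ring_per_group == 1)
        return (E1000_VMDQ_MAC);    /* MAC classification only */
    return (E1000_VMDQ_MAC_RSS);    /* both */
}

int
main(void)
{
    /* Expected output: 0 1 2 */
    (void) printf("%d %d %d\n", pick_vmdq_mode(4, 1),
        pick_vmdq_mode(4, 4), pick_vmdq_mode(8, 2));
    return (0);
}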
*/
static uint_t
igb_intr_tx_other(void *arg1, void *arg2)
@@ -3401,17 +3676,18 @@ igb_intr_tx_other(void *arg1, void *arg2)
 icr = E1000_READ_REG(&igb->hw, E1000_ICR);

 /*
- * Always look for Tx cleanup work. We don't have separate
- * transmit vectors, so we have only one tx ring enabled.
+ * Look for tx reclaiming work first. Remember that in the
+ * interrupt-sharing case only one tx ring is
+ * used.
 */
- ASSERT(igb->num_tx_rings == 1);
 igb_intr_tx_work(&igb->tx_rings[0]);

 /*
- * Check for "other" causes.
+ * Check the cause bits; only a link change is
+ * processed here.
 */
 if (icr & E1000_ICR_LSC) {
- igb_intr_other_work(igb);
+ igb_intr_link_work(igb);
 }

 return (DDI_INTR_CLAIMED);
@@ -3504,23 +3780,12 @@ static int
igb_alloc_intr_handles(igb_t *igb, int intr_type)
{
 dev_info_t *devinfo;
- int request, count, avail, actual;
- int rx_rings, minimum;
+ int orig, request, count, avail, actual;
+ int diff, minimum;
 int rc;

 devinfo = igb->dip;

- /*
- * Currently only 1 tx ring is supported. More tx rings
- * will be supported with future enhancement.
- */
- if (igb->num_tx_rings > 1) {
- igb->num_tx_rings = 1;
- igb_log(igb,
- "Use only 1 MSI-X vector for tx, "
- "force tx queue number to 1");
- }
-
 switch (intr_type) {
 case DDI_INTR_TYPE_FIXED:
 request = 1; /* Request 1 legacy interrupt handle */
@@ -3536,12 +3801,12 @@ igb_alloc_intr_handles(igb_t *igb, int intr_type)
 case DDI_INTR_TYPE_MSIX:
 /*
- * Best number of vectors for the adapter is
- * # rx rings + # tx rings + 1 for other
- * But currently we only support number of vectors of
- * # rx rings + 1 for tx & other
+ * Number of vectors for the adapter is
+ * # rx rings + # tx rings;
+ * one of the tx vectors also covers the other causes.
 */
- request = igb->num_rx_rings + 1;
+ request = igb->num_rx_rings + igb->num_tx_rings;
+ orig = request;
 minimum = 2;
 IGB_DEBUGLOG_0(igb, "interrupt type: MSI-X");
 break;
@@ -3613,15 +3878,24 @@
 }

 /*
- * For MSI-X, actual might force us to reduce number of rx rings
+ * For MSI-X, actual might force us to reduce number of tx & rx rings
 */
- if (intr_type == DDI_INTR_TYPE_MSIX) {
- rx_rings = actual - 1;
- if (rx_rings < igb->num_rx_rings) {
+ if ((intr_type == DDI_INTR_TYPE_MSIX) && (orig > actual)) {
+ diff = orig - actual;
+ if (diff < igb->num_tx_rings) {
+ igb_log(igb,
+ "MSI-X vectors force Tx queue number to %d",
+ igb->num_tx_rings - diff);
+ igb->num_tx_rings -= diff;
+ } else {
+ igb_log(igb,
+ "MSI-X vectors force Tx queue number to 1");
+ igb->num_tx_rings = 1;
+
 igb_log(igb, "MSI-X vectors force Rx queue number to %d",
- rx_rings);
- igb->num_rx_rings = rx_rings;
+ actual - 1);
+ igb->num_rx_rings = actual - 1;
 }
 }
@@ -3662,6 +3936,7 @@ static int
igb_add_intr_handlers(igb_t *igb)
{
 igb_rx_ring_t *rx_ring;
+ igb_tx_ring_t *tx_ring;
 int vector;
 int rc;
 int i;
@@ -3671,14 +3946,17 @@
 switch (igb->intr_type) {
 case DDI_INTR_TYPE_MSIX:
 /* Add interrupt handler for tx + other */
+ tx_ring = &igb->tx_rings[0];
 rc = ddi_intr_add_handler(igb->htable[vector],
 (ddi_intr_handler_t *)igb_intr_tx_other,
 (void *)igb, NULL);
+
 if (rc != DDI_SUCCESS) {
 igb_log(igb,
 "Add tx/other interrupt handler failed: %d", rc);
 return (IGB_FAILURE);
 }
+ tx_ring->intr_vector = vector;
 vector++;

 /* Add interrupt handler for each rx ring */
@@ -3704,6 +3982,31 @@
 vector++;
 }
+
+ /* Add interrupt handler for each tx ring from the 2nd ring */
+ for (i = 1; i < igb->num_tx_rings; i++) {
+ tx_ring = &igb->tx_rings[i];
+
+ 
rc = ddi_intr_add_handler(igb->htable[vector], + (ddi_intr_handler_t *)igb_intr_tx, + (void *)tx_ring, NULL); + + if (rc != DDI_SUCCESS) { + igb_log(igb, + "Add tx interrupt handler failed. " + "return: %d, tx ring: %d", rc, i); + for (vector--; vector >= 0; vector--) { + (void) ddi_intr_remove_handler( + igb->htable[vector]); + } + return (IGB_FAILURE); + } + + tx_ring->intr_vector = vector; + + vector++; + } + break; case DDI_INTR_TYPE_MSI: @@ -3764,14 +4067,14 @@ igb_setup_adapter_msix(igb_t *igb) struct e1000_hw *hw = &igb->hw; /* - * Set vector for Tx + Other causes - * NOTE assumption that there is only one of these and it is vector 0 + * Set vector for other causes, NOTE assumption that it is vector 0 */ vector = 0; + igb->eims_mask = E1000_EICR_TX_QUEUE0 | E1000_EICR_OTHER; E1000_WRITE_REG(hw, E1000_MSIXBM(vector), igb->eims_mask); - vector++; + for (i = 0; i < igb->num_rx_rings; i++) { /* * Set vector for each rx ring @@ -3787,6 +4090,21 @@ igb_setup_adapter_msix(igb_t *igb) vector++; } + for (i = 1; i < igb->num_tx_rings; i++) { + /* + * Set vector for each tx ring from 2nd tx ring + */ + eims = (E1000_EICR_TX_QUEUE0 << i); + E1000_WRITE_REG(hw, E1000_MSIXBM(vector), eims); + + /* + * Accumulate bits to enable in igb_enable_adapter_interrupts() + */ + igb->eims_mask |= eims; + + vector++; + } + ASSERT(vector == igb->intr_cnt); /* diff --git a/usr/src/uts/common/io/igb/igb_osdep.c b/usr/src/uts/common/io/igb/igb_osdep.c index 9d03c05494..f915edd5ae 100644 --- a/usr/src/uts/common/io/igb/igb_osdep.c +++ b/usr/src/uts/common/io/igb/igb_osdep.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
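/*
 * Editor's sketch: the MSI-X shortfall handling in igb_alloc_intr_handles()
 * earlier in this diff, modeled standalone. When the system grants fewer
 * vectors than the rx + tx request, tx rings absorb the deficit first;
 * only when that is not enough does tx fall back to one ring and rx take
 * whatever vectors remain.
 */
#include <stdio.h>

static void
absorb_shortfall(int *rx, int *tx, int actual)
{
    int diff = (*rx + *tx) - actual;

    if (diff <= 0)
        return;            /* request fully granted */
    if (diff < *tx) {
        *tx -= diff;       /* shrink tx rings only */
    } else {
        *tx = 1;           /* tx falls back to one ring */
        *rx = actual - 1;  /* the rest of the vectors go to rx */
    }
}

int
main(void)
{
    int rx = 4, tx = 4;

    absorb_shortfall(&rx, &tx, 6);  /* 8 requested, 6 granted */
    (void) printf("rx=%d tx=%d\n", rx, tx); /* rx=4 tx=2 */
    return (0);
}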
+ */

#include "igb_osdep.h"
#include "igb_api.h"
@@ -114,3 +114,61 @@ e1000_enable_pciex_master(struct e1000_hw *hw)
 ctrl &= ~E1000_CTRL_GIO_MASTER_DISABLE;
 E1000_WRITE_REG(hw, E1000_CTRL, ctrl);
}
+
+/*
+ * e1000_rar_clear - Clear the specified RAR register
+ */
+void
+e1000_rar_clear(struct e1000_hw *hw, uint32_t index)
+{
+ uint32_t rar_high;
+
+ /* Mark the address invalid to the hardware by clearing the AV bit */
+ rar_high = ~E1000_RAH_AV;
+
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, ((index << 1) + 1), rar_high);
+ E1000_WRITE_FLUSH(hw);
+}
+
+/*
+ * e1000_rar_set_vmdq - Set the RAR registers for VMDq
+ */
+void
+e1000_rar_set_vmdq(struct e1000_hw *hw, const uint8_t *addr, uint32_t index,
+ uint32_t vmdq_mode, uint8_t qsel)
+{
+ uint32_t rar_low, rar_high;
+
+ /*
+ * NIC expects these in little endian so reverse the byte order
+ * from network order (big endian) to little endian.
+ */
+ rar_low = ((uint32_t)addr[0] | ((uint32_t)addr[1] << 8) |
+ ((uint32_t)addr[2] << 16) | ((uint32_t)addr[3] << 24));
+
+ rar_high = ((uint32_t)addr[4] | ((uint32_t)addr[5] << 8));
+
+ /* Indicate to hardware the Address is Valid. */
+ rar_high |= E1000_RAH_AV;
+
+ /* Set the queue selector based on the vmdq mode */
+ switch (vmdq_mode) {
+ default:
+ case E1000_VMDQ_OFF:
+ break;
+ case E1000_VMDQ_MAC:
+ rar_high |= (qsel << 18);
+ break;
+ case E1000_VMDQ_MAC_RSS:
+ rar_high |= 1 << (18 + qsel);
+ break;
+ }
+
+ /* write to receive address registers */
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, (index << 1), rar_low);
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, ((index << 1) + 1), rar_high);
+ E1000_WRITE_FLUSH(hw);
+}
diff --git a/usr/src/uts/common/io/igb/igb_osdep.h b/usr/src/uts/common/io/igb/igb_osdep.h
index 42ba27a2e3..f56f320a1c 100644
--- a/usr/src/uts/common/io/igb/igb_osdep.h
+++ b/usr/src/uts/common/io/igb/igb_osdep.h
@@ -1,19 +1,17 @@
/*
 * CDDL HEADER START
 *
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,15 +20,17 @@
 */

/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
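/*
 * Editor's sketch: the RAR packing that e1000_rar_set_vmdq() above
 * performs, standalone. The six MAC bytes fold little-endian into
 * RAL/RAH, the Address Valid bit is raised, and the queue selector lands
 * in the upper RAH bits according to the VMDq mode. RAH_AV as bit 31 is
 * an assumption made here purely for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define RAH_AV  0x80000000u /* assumed Address Valid bit position */

static void
pack_rar(const uint8_t *addr, uint8_t qsel, int mac_rss,
    uint32_t *rar_low, uint32_t *rar_high)
{
    *rar_low = (uint32_t)addr[0] | ((uint32_t)addr[1] << 8) |
        ((uint32_t)addr[2] << 16) | ((uint32_t)addr[3] << 24);
    *rar_high = ((uint32_t)addr[4] | ((uint32_t)addr[5] << 8)) | RAH_AV;
    if (mac_rss)
        *rar_high |= 1u << (18 + qsel);     /* E1000_VMDQ_MAC_RSS case */
    else
        *rar_high |= (uint32_t)qsel << 18;  /* E1000_VMDQ_MAC case */
}

int
main(void)
{
    uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0xaa, 0xbb, 0xcc };
    uint32_t lo, hi;

    pack_rar(mac, 1, 0, &lo, &hi);
    (void) printf("RAL=0x%08x RAH=0x%08x\n", (unsigned)lo, (unsigned)hi);
    return (0);
}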
*/ #ifndef _IGB_OSDEP_H #define _IGB_OSDEP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -96,6 +96,18 @@ extern "C" { #define IEEE_ESR_1000X_HD_CAPS 0x4000 /* 1000X HD capable */ #define IEEE_ESR_1000X_FD_CAPS 0x8000 /* 1000X FD capable */ +/* VMDq MODE supported by hardware */ +#define E1000_VMDQ_OFF 0 +#define E1000_VMDQ_MAC 1 +#define E1000_VMDQ_MAC_RSS 2 + +/* VMDq based on packet destination MAC address */ +#define E1000_MRQC_ENABLE_VMDQ_MAC_GROUP 0x00000003 +/* VMDq based on packet destination MAC address and RSS */ +#define E1000_MRQC_ENABLE_VMDQ_MAC_RSS_GROUP 0x00000005 +/* The default queue in each VMDqs */ +#define E1000_VMDQ_MAC_GROUP_DEFAULT_QUEUE 0x100 + #define E1000_WRITE_FLUSH(a) (void) E1000_READ_REG(a, E1000_STATUS) #define E1000_WRITE_REG(hw, reg, value) \ diff --git a/usr/src/uts/common/io/igb/igb_rx.c b/usr/src/uts/common/io/igb/igb_rx.c index ec04dc6b8e..acf15ed35c 100644 --- a/usr/src/uts/common/io/igb/igb_rx.c +++ b/usr/src/uts/common/io/igb/igb_rx.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "igb_sw.h" @@ -251,6 +251,24 @@ igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) } } +mblk_t * +igb_rx_ring_poll(void *arg, int bytes) +{ + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)arg; + mblk_t *mp = NULL; + + ASSERT(bytes >= 0); + + if (bytes == 0) + return (mp); + + mutex_enter(&rx_ring->rx_lock); + mp = igb_rx(rx_ring, bytes); + mutex_exit(&rx_ring->rx_lock); + + return (mp); +} + /* * igb_rx - Receive the data of one ring * @@ -260,7 +278,7 @@ igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) * passed up to mac_rx(). 
*/ mblk_t * -igb_rx(igb_rx_ring_t *rx_ring) +igb_rx(igb_rx_ring_t *rx_ring, int poll_bytes) { union e1000_adv_rx_desc *current_rbd; rx_control_block_t *current_rcb; @@ -272,6 +290,7 @@ igb_rx(igb_rx_ring_t *rx_ring) uint32_t pkt_len; uint32_t status_error; uint32_t pkt_num; + uint32_t total_bytes; igb_t *igb = rx_ring->igb; mblk_head = NULL; @@ -296,6 +315,7 @@ igb_rx(igb_rx_ring_t *rx_ring) current_rbd = &rx_ring->rbd_ring[rx_next]; pkt_num = 0; + total_bytes = 0; status_error = current_rbd->wb.upper.status_error; while (status_error & E1000_RXD_STAT_DD) { /* @@ -315,6 +335,14 @@ igb_rx(igb_rx_ring_t *rx_ring) (status_error & E1000_RXDEXT_STATERR_IPE)); pkt_len = current_rbd->wb.upper.length; + + if ((poll_bytes != IGB_NO_POLL) && + ((pkt_len + total_bytes) > poll_bytes)) + break; + + IGB_DEBUG_STAT(rx_ring->stat_pkt_cnt); + total_bytes += pkt_len; + mp = NULL; /* * For packets with length more than the copy threshold, diff --git a/usr/src/uts/common/io/igb/igb_sw.h b/usr/src/uts/common/io/igb/igb_sw.h index 457c929d1a..a69ba3bb77 100644 --- a/usr/src/uts/common/io/igb/igb_sw.h +++ b/usr/src/uts/common/io/igb/igb_sw.h @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,15 +20,17 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. 
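/*
 * Editor's sketch: the byte-budget cut-off this changeset adds to
 * igb_rx(), modeled on a plain array of packet lengths. IGB_NO_POLL (-1)
 * means interrupt mode with no budget; otherwise the loop stops before
 * the packet that would overrun the requested byte count, leaving it for
 * the next poll cycle.
 */
#include <stdio.h>

#define IGB_NO_POLL -1

static int
take_packets(const int *pkt_len, int npkts, int poll_bytes)
{
    int i, total_bytes = 0;

    for (i = 0; i < npkts; i++) {
        if ((poll_bytes != IGB_NO_POLL) &&
            ((pkt_len[i] + total_bytes) > poll_bytes))
            break;
        total_bytes += pkt_len[i];
    }
    return (i); /* number of packets handed up the stack */
}

int
main(void)
{
    int lens[4] = { 1500, 1500, 1500, 64 };

    /* A 3200-byte budget accepts exactly two 1500-byte frames. */
    (void) printf("%d\n", take_packets(lens, 4, 3200));
    return (0);
}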
*/ #ifndef _IGB_SW_H #define _IGB_SW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -48,7 +48,7 @@ extern "C" { #include <sys/modctl.h> #include <sys/errno.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/vlan.h> #include <sys/ddi.h> @@ -88,6 +88,9 @@ extern "C" { #define IGB_INTR_MSI 2 #define IGB_INTR_LEGACY 3 +#define IGB_NO_POLL -1 +#define IGB_NO_FREE_SLOT -1 + #define MAX_NUM_UNICAST_ADDRESSES E1000_RAR_ENTRIES #define MAX_NUM_MULTICAST_ADDRESSES 256 #define MAX_NUM_EITR 10 @@ -97,10 +100,9 @@ extern "C" { /* * Maximum values for user configurable parameters */ -#define MAX_TX_QUEUE_NUM 4 -#define MAX_RX_QUEUE_NUM 4 #define MAX_TX_RING_SIZE 4096 #define MAX_RX_RING_SIZE 4096 +#define MAX_RX_GROUP_NUM 4 #define MAX_MTU 9000 #define MAX_RX_LIMIT_PER_INTR 4096 @@ -119,10 +121,9 @@ extern "C" { /* * Minimum values for user configurable parameters */ -#define MIN_TX_QUEUE_NUM 1 -#define MIN_RX_QUEUE_NUM 1 #define MIN_TX_RING_SIZE 64 #define MIN_RX_RING_SIZE 64 +#define MIN_RX_GROUP_NUM 1 #define MIN_MTU ETHERMIN #define MIN_RX_LIMIT_PER_INTR 16 @@ -140,10 +141,11 @@ extern "C" { /* * Default values for user configurable parameters */ -#define DEFAULT_TX_QUEUE_NUM 1 -#define DEFAULT_RX_QUEUE_NUM 1 +#define DEFAULT_TX_QUEUE_NUM 4 +#define DEFAULT_RX_QUEUE_NUM 4 #define DEFAULT_TX_RING_SIZE 512 #define DEFAULT_RX_RING_SIZE 512 +#define DEFAULT_RX_GROUP_NUM 1 #define DEFAULT_MTU ETHERMTU #define DEFAULT_RX_LIMIT_PER_INTR 256 @@ -187,7 +189,6 @@ extern "C" { #define ATTACH_PROGRESS_ENABLE_INTR 0x1000 /* DDI interrupts enabled */ #define ATTACH_PROGRESS_FMINIT 0x2000 /* FMA initialized */ - #define PROP_ADV_AUTONEG_CAP "adv_autoneg_cap" #define PROP_ADV_1000FDX_CAP "adv_1000fdx_cap" #define PROP_ADV_1000HDX_CAP "adv_1000hdx_cap" @@ -197,10 +198,10 @@ extern "C" { #define PROP_ADV_10HDX_CAP "adv_10hdx_cap" #define PROP_DEFAULT_MTU "default_mtu" #define PROP_FLOW_CONTROL "flow_control" -#define PROP_TX_QUEUE_NUM "tx_queue_number" #define PROP_TX_RING_SIZE "tx_ring_size" -#define PROP_RX_QUEUE_NUM "rx_queue_number" #define PROP_RX_RING_SIZE "rx_ring_size" +#define PROP_MR_ENABLE "mr_enable" +#define PROP_RX_GROUP_NUM "rx_group_number" #define PROP_INTR_FORCE "intr_force" #define PROP_TX_HCKSUM_ENABLE "tx_hcksum_enable" @@ -410,7 +411,7 @@ typedef union igb_ether_addr { } reg; struct { uint8_t set; - uint8_t redundant; + uint8_t group_index; uint8_t addr[ETHERADDRL]; } mac; } igb_ether_addr_t; @@ -479,6 +480,7 @@ typedef struct rx_control_block { */ typedef struct igb_tx_ring { uint32_t index; /* Ring index */ + uint32_t intr_vector; /* Interrupt vector index */ /* * Mutexes @@ -538,13 +540,14 @@ typedef struct igb_tx_ring { uint32_t stat_fail_no_tcb; uint32_t stat_fail_dma_bind; uint32_t stat_reschedule; + uint32_t stat_pkt_cnt; #endif /* * Pointer to the igb struct */ struct igb *igb; - + mac_ring_handle_t ring_handle; /* call back ring handle */ } igb_tx_ring_t; /* @@ -592,12 +595,24 @@ typedef struct igb_rx_ring { uint32_t stat_frame_error; uint32_t stat_cksum_error; uint32_t stat_exceed_pkt; + uint32_t stat_pkt_cnt; #endif struct igb *igb; /* Pointer to igb struct */ - + mac_ring_handle_t ring_handle; /* call back ring handle */ + uint32_t group_index; /* group index */ + uint64_t ring_gen_num; } igb_rx_ring_t; +/* + * Software Receive Ring Group + */ +typedef struct igb_rx_group { + uint32_t index; /* Group index */ + mac_group_handle_t group_handle; /* call back group handle */ + struct 
igb *igb; /* Pointer to igb struct */ +} igb_rx_group_t; + typedef struct igb { int instance; mac_handle_t mac_hdl; @@ -616,13 +631,18 @@ typedef struct igb { uint32_t loopback_mode; uint32_t max_frame_size; + uint32_t mr_enable; /* Enable multiple rings */ + uint32_t vmdq_mode; /* Mode of VMDq */ + /* - * Receive Rings + * Receive Rings and Groups */ igb_rx_ring_t *rx_rings; /* Array of rx rings */ uint32_t num_rx_rings; /* Number of rx rings in use */ uint32_t rx_ring_size; /* Rx descriptor ring size */ uint32_t rx_buf_size; /* Rx buffer size */ + igb_rx_group_t *rx_groups; /* Array of rx groups */ + uint32_t num_rx_groups; /* Number of rx groups in use */ /* * Transmit Rings @@ -652,6 +672,7 @@ typedef struct igb { uint_t intr_pri; ddi_intr_handle_t *htable; uint32_t eims_mask; + uint32_t ims_mask; kmutex_t gen_lock; /* General lock for device access */ kmutex_t watchdog_lock; @@ -772,7 +793,8 @@ void igb_free_dma(igb_t *); int igb_start(igb_t *); void igb_stop(igb_t *); int igb_setup_link(igb_t *, boolean_t); -int igb_unicst_set(igb_t *, const uint8_t *, mac_addr_slot_t); +int igb_unicst_find(igb_t *, const uint8_t *); +int igb_unicst_set(igb_t *, const uint8_t *, int); int igb_multicst_add(igb_t *, const uint8_t *); int igb_multicst_remove(igb_t *, const uint8_t *); enum ioc_reply igb_loopback_ioctl(igb_t *, struct iocblk *, mblk_t *); @@ -795,22 +817,23 @@ int igb_m_unicst(void *, const uint8_t *); int igb_m_stat(void *, uint_t, uint64_t *); void igb_m_resources(void *); void igb_m_ioctl(void *, queue_t *, mblk_t *); -int igb_m_unicst_add(void *, mac_multi_addr_t *); -int igb_m_unicst_remove(void *, mac_addr_slot_t); -int igb_m_unicst_modify(void *, mac_multi_addr_t *); -int igb_m_unicst_get(void *, mac_multi_addr_t *); boolean_t igb_m_getcapab(void *, mac_capab_t, void *); +void igb_fill_ring(void *, mac_ring_type_t, const int, const int, + mac_ring_info_t *, mac_ring_handle_t); +void igb_fill_group(void *arg, mac_ring_type_t, const int, + mac_group_info_t *, mac_group_handle_t); +int igb_rx_ring_intr_enable(mac_intr_handle_t); +int igb_rx_ring_intr_disable(mac_intr_handle_t); /* * Function prototypes in igb_rx.c */ -mblk_t *igb_rx(igb_rx_ring_t *); +mblk_t *igb_rx(igb_rx_ring_t *, int); void igb_rx_recycle(caddr_t arg); /* * Function prototypes in igb_tx.c */ -mblk_t *igb_m_tx(void *, mblk_t *); void igb_free_tcb(tx_control_block_t *); void igb_put_free_list(igb_tx_ring_t *, link_list_t *); uint32_t igb_tx_recycle_legacy(igb_tx_ring_t *); @@ -835,6 +858,8 @@ enum ioc_reply igb_nd_ioctl(igb_t *, queue_t *, mblk_t *, struct iocblk *); */ int igb_init_stats(igb_t *); +mblk_t *igb_rx_ring_poll(void *, int); +mblk_t *igb_tx_ring_send(void *, mblk_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/igb/igb_tx.c b/usr/src/uts/common/io/igb/igb_tx.c index b3a0090ebe..7b43bbad97 100644 --- a/usr/src/uts/common/io/igb/igb_tx.c +++ b/usr/src/uts/common/io/igb/igb_tx.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
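/*
 * Editor's sketch, conceptual only: what the igb_rx_ring_intr_enable()
 * and igb_rx_ring_intr_disable() entry points declared above are assumed
 * to do, namely set or clear the ring's EIMS bit so the mac layer can
 * flip a ring between interrupt delivery and igb_rx_ring_poll(). The
 * register is modeled as a plain variable; the real bodies are not shown
 * in this hunk.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t eims;   /* stands in for the E1000_EIMS register */

static void
ring_intr_enable(int vector)
{
    eims |= 1u << vector;       /* re-arm: the ring interrupts again */
}

static void
ring_intr_disable(int vector)
{
    eims &= ~(1u << vector);    /* quiesce: the mac layer polls instead */
}

int
main(void)
{
    ring_intr_enable(2);
    (void) printf("0x%x\n", (unsigned)eims);    /* 0x4 */
    ring_intr_disable(2);
    (void) printf("0x%x\n", (unsigned)eims);    /* 0x0 */
    return (0);
}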
* - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "igb_sw.h" @@ -42,7 +42,7 @@ static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *); static void igb_get_hcksum_context(mblk_t *, hcksum_context_t *); static boolean_t igb_check_hcksum_context(igb_tx_ring_t *, hcksum_context_t *); static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *, - hcksum_context_t *); + hcksum_context_t *, uint32_t); #ifndef IGB_DEBUG #pragma inline(igb_save_desc) @@ -51,58 +51,14 @@ static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *, #pragma inline(igb_fill_hcksum_context) #endif -/* - * igb_m_tx - * - * The GLDv3 interface to call driver's tx routine to transmit - * the mblks. - */ mblk_t * -igb_m_tx(void *arg, mblk_t *mp) +igb_tx_ring_send(void *arg, mblk_t *mp) { - igb_t *igb = (igb_t *)arg; - mblk_t *next; - igb_tx_ring_t *tx_ring; + igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg; - /* - * If the adapter is suspended, or it is not started, or the link - * is not up, the mblks are simply dropped. - */ - if (((igb->igb_state & IGB_SUSPENDED) != 0) || - ((igb->igb_state & IGB_STARTED) == 0) || - (igb->link_state != LINK_STATE_UP)) { - /* Free the mblk chain */ - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - freemsg(mp); - mp = next; - } + ASSERT(tx_ring != NULL); - return (NULL); - } - - /* - * Decide which tx ring is used to transmit the packets. - * This needs to be updated later to fit the new interface - * of the multiple rings support. - */ - tx_ring = &igb->tx_rings[0]; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - if (!igb_tx(tx_ring, mp)) { - mp->b_next = next; - break; - } - - mp = next; - } - - return (mp); + return ((igb_tx(tx_ring, mp)) ? NULL : mp); } /* @@ -671,7 +627,7 @@ igb_check_hcksum_context(igb_tx_ring_t *tx_ring, hcksum_context_t *hcksum) */ static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd, - hcksum_context_t *hcksum) + hcksum_context_t *hcksum, uint32_t ring_index) { /* * Fill the context descriptor with the checksum @@ -708,7 +664,7 @@ igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd, } ctx_tbd->seqnum_seed = 0; - ctx_tbd->mss_l4len_idx = 0; + ctx_tbd->mss_l4len_idx = ring_index << 4; } /* @@ -764,7 +720,8 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list, * hardware checksum offload informations. 
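/*
 * Editor's sketch: the per-ring send contract that igb_tx_ring_send()
 * above implements for the mac layer, with stub types standing in for
 * mblk_t and igb_tx_ring_t. Returning NULL means the frame was consumed;
 * handing the mblk back tells the caller to hold it until the reschedule
 * path (mac_tx_ring_update()) reports free descriptors.
 */
#include <stdio.h>

typedef struct mblk { int len; } mblk_t;        /* stand-in type */
typedef struct ring { int tbd_free; } ring_t;   /* stand-in type */

/* Stands in for igb_tx(): succeeds while descriptors remain. */
static int
ring_tx(ring_t *ring, mblk_t *mp)
{
    (void) mp;
    if (ring->tbd_free == 0)
        return (0);
    ring->tbd_free--;
    return (1);
}

/* Mirrors igb_tx_ring_send(): NULL = consumed, mp = retry later. */
static mblk_t *
ring_send(ring_t *ring, mblk_t *mp)
{
    return (ring_tx(ring, mp) ? NULL : mp);
}

int
main(void)
{
    ring_t ring = { 1 };
    mblk_t m = { 64 };

    (void) printf("%s\n", ring_send(&ring, &m) == NULL ? "sent" : "held");
    (void) printf("%s\n", ring_send(&ring, &m) == NULL ? "sent" : "held");
    return (0);
}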
*/ igb_fill_hcksum_context( - (struct e1000_adv_tx_context_desc *)tbd, hcksum); + (struct e1000_adv_tx_context_desc *)tbd, hcksum, + tx_ring->index); index = NEXT_INDEX(index, 1, tx_ring->ring_size); desc_num++; @@ -843,6 +800,7 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list, if (hcksum_flags & HCK_PARTIALCKSUM) first_tbd->read.olinfo_status |= E1000_TXD_POPTS_TXSM << 8; + first_tbd->read.olinfo_status |= tx_ring->index << 4; } /* @@ -853,6 +811,8 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list, tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS; + IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt); + /* * Sync the DMA buffer of the tx descriptor ring */ diff --git a/usr/src/uts/common/io/ipw/ipw2100.c b/usr/src/uts/common/io/ipw/ipw2100.c index 3ad59d1051..d1171b5122 100644 --- a/usr/src/uts/common/io/ipw/ipw2100.c +++ b/usr/src/uts/common/io/ipw/ipw2100.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <net/if.h> #include <sys/mac_wifi.h> #include <sys/varargs.h> @@ -177,7 +177,6 @@ mac_callbacks_t ipw2100_m_callbacks = { ipw2100_m_multicst, ipw2100_m_unicst, ipw2100_m_tx, - NULL, ipw2100_m_ioctl }; diff --git a/usr/src/uts/common/io/iwh/iwh.c b/usr/src/uts/common/io/iwh/iwh.c index cce2a98845..1865a7ee5c 100644 --- a/usr/src/uts/common/io/iwh/iwh.c +++ b/usr/src/uts/common/io/iwh/iwh.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -414,7 +414,6 @@ mac_callbacks_t iwh_m_callbacks = { iwh_m_multicst, iwh_m_unicst, iwh_m_tx, - NULL, iwh_m_ioctl }; diff --git a/usr/src/uts/common/io/iwi/ipw2200.c b/usr/src/uts/common/io/iwi/ipw2200.c index 465c3ea2a7..80633d498f 100644 --- a/usr/src/uts/common/io/iwi/ipw2200.c +++ b/usr/src/uts/common/io/iwi/ipw2200.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/varargs.h> #include <sys/pci.h> @@ -207,7 +207,6 @@ mac_callbacks_t ipw2200_m_callbacks = { ipw2200_m_multicst, ipw2200_m_unicst, ipw2200_m_tx, - NULL, ipw2200_m_ioctl }; diff --git a/usr/src/uts/common/io/iwk/iwk2.c b/usr/src/uts/common/io/iwk/iwk2.c index a0f17f2927..4ec4b774c8 100644 --- a/usr/src/uts/common/io/iwk/iwk2.c +++ b/usr/src/uts/common/io/iwk/iwk2.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -423,7 +423,6 @@ mac_callbacks_t iwk_m_callbacks = { iwk_m_multicst, iwk_m_unicst, iwk_m_tx, - NULL, iwk_m_ioctl, NULL, NULL, diff --git a/usr/src/uts/common/io/ixgbe/ixgbe.conf b/usr/src/uts/common/io/ixgbe/ixgbe.conf index 0e46fe5a0d..215d3d9516 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe.conf +++ b/usr/src/uts/common/io/ixgbe/ixgbe.conf @@ -1,19 +1,17 @@ # # CDDL HEADER START # -# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # -# You can obtain a copy of the license at: -# http://www.opensolaris.org/os/licensing. 
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # -# When using or redistributing this file, you may do so under the -# License only. No other modification of this header is permitted. -# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] @@ -21,11 +19,10 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms of the CDDL. +# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. # # # Driver.conf file for Intel 10GbE PCIE NIC Driver (ixgbe) @@ -45,35 +42,31 @@ # 1 - Receive only # 2 - Transmit only # 3 - Receive and transmit -# default value: 3 +# default value: 0 # # flow_control = 3; # # -------------------- Transmit/Receive Queues -------------------- -# tx/rx queue. -# tx_queue_number -# The number of the transmit queues -# Allowed values: 1 - 32 -# Default value: 1 # # tx_ring_size # The number of the transmit descriptors per transmit queue # Allowed values: 64 - 4096 -# Default value: 512 -# -# rx_queue_number -# The number of the receive queues -# Allowed values: 1 - 64 -# Default value: 1 +# Default value: 1024 # # rx_ring_size # The number of the receive descriptors per receive queue # Allowed values: 64 - 4096 -# Default value: 512 +# Default value: 1024 # -# Note: The final values of tx_queue_number and rx_queue_number are decided -# by the number of interrupt vectors obtained by the driver. They could be -# less than the specified values because of limited interrupt vector number. +# mr_enable +# Enable multiple tx queues and rx queues +# Allowed values: 0 - 1 +# Default value: 1 +# +# rx_group_number +# The number of the receive groups +# Allowed values: 1 - 16 +# Default value: 1 # # -------- How to set parameters for a particular interface --------- # The example below shows how to locate the device path and set a parameter diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/ixgbe_common.c index f472cbd290..76e0232ff7 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_common.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_common.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. 
- * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,14 +20,16 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. */ /* IntelVersion: 1.159 v2008-03-04 */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "ixgbe_common.h" #include "ixgbe_api.h" @@ -1546,27 +1546,11 @@ ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr) void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr) { - u32 rar_entries = hw->mac.num_rar_entries; - u32 rar; - DEBUGOUT6(" MC Addr =%.2X %.2X %.2X %.2X %.2X %.2X\n", mc_addr[0], mc_addr[1], mc_addr[2], mc_addr[3], mc_addr[4], mc_addr[5]); - /* - * Place this multicast address in the RAR if there is room, - * else put it in the MTA - */ - if (hw->addr_ctrl.rar_used_count < rar_entries) { - /* use RAR from the end up for multicast */ - rar = rar_entries - hw->addr_ctrl.mc_addr_in_rar_count - 1; - hw->mac.ops.set_rar(hw, rar, mc_addr, 0, IXGBE_RAH_AV); - DEBUGOUT1("Added a multicast address to RAR[%d]\n", rar); - hw->addr_ctrl.rar_used_count++; - hw->addr_ctrl.mc_addr_in_rar_count++; - } else { - ixgbe_set_mta(hw, mc_addr); - } + ixgbe_set_mta(hw, mc_addr); DEBUGOUT("ixgbe_add_mc_addr Complete\n"); } @@ -1588,7 +1572,6 @@ ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr next) { u32 i; - u32 rar_entries = hw->mac.num_rar_entries; u32 vmdq; /* @@ -1596,18 +1579,8 @@ ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, * use. */ hw->addr_ctrl.num_mc_addrs = mc_addr_count; - hw->addr_ctrl.rar_used_count -= hw->addr_ctrl.mc_addr_in_rar_count; - hw->addr_ctrl.mc_addr_in_rar_count = 0; hw->addr_ctrl.mta_in_use = 0; - /* Zero out the other receive addresses. */ - DEBUGOUT2("Clearing RAR[%d-%d]\n", hw->addr_ctrl.rar_used_count, - rar_entries - 1); - for (i = hw->addr_ctrl.rar_used_count; i < rar_entries; i++) { - IXGBE_WRITE_REG(hw, IXGBE_RAL(i), 0); - IXGBE_WRITE_REG(hw, IXGBE_RAH(i), 0); - } - /* Clear the MTA */ DEBUGOUT(" Clearing MTA\n"); for (i = 0; i < hw->mac.mcft_size; i++) diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_gld.c b/usr/src/uts/common/io/ixgbe/ixgbe_gld.c index 78a96bd4ef..b4b3a966fe 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_gld.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_gld.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. 
- * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "ixgbe_sw.h" @@ -103,16 +103,24 @@ ixgbe_m_stat(void *arg, uint_t stat, uint64_t *val) break; case MAC_STAT_RBYTES: - for (i = 0; i < 16; i++) - ixgbe_ks->tor.value.ui64 += + ixgbe_ks->tor.value.ui64 = 0; + for (i = 0; i < 16; i++) { + ixgbe_ks->qbrc[i].value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBRC(i)); + ixgbe_ks->tor.value.ui64 += + ixgbe_ks->qbrc[i].value.ui64; + } *val = ixgbe_ks->tor.value.ui64; break; case MAC_STAT_OBYTES: - for (i = 0; i < 16; i++) - ixgbe_ks->tot.value.ui64 += + ixgbe_ks->tot.value.ui64 = 0; + for (i = 0; i < 16; i++) { + ixgbe_ks->qbtc[i].value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBTC(i)); + ixgbe_ks->tot.value.ui64 += + ixgbe_ks->qbtc[i].value.ui64; + } *val = ixgbe_ks->tot.value.ui64; break; @@ -412,37 +420,6 @@ ixgbe_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) } /* - * Set a new device unicast address. - */ -int -ixgbe_m_unicst(void *arg, const uint8_t *mac_addr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - int result; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - /* - * Store the new MAC address. - */ - bcopy(mac_addr, ixgbe->hw.mac.addr, ETHERADDRL); - - /* - * Set MAC address in address slot 0, which is the default address. - */ - result = ixgbe_unicst_set(ixgbe, mac_addr, 0); - - mutex_exit(&ixgbe->gen_lock); - - return (result); -} - -/* * Pass on M_IOCTL messages passed to the DLD, and support * private IOCTLs for debugging and ndd. */ @@ -511,191 +488,6 @@ ixgbe_m_ioctl(void *arg, queue_t *q, mblk_t *mp) } } - -/* - * Find an unused address slot, set the address to it, reserve - * this slot and enable the device to start filtering on the - * new address. - */ -int -ixgbe_m_unicst_add(void *arg, mac_multi_addr_t *maddr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mac_addr_slot_t slot; - int err; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - if (mac_unicst_verify(ixgbe->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - if (ixgbe->unicst_avail == 0) { - /* no slots available */ - mutex_exit(&ixgbe->gen_lock); - return (ENOSPC); - } - - /* - * Primary/default address is in slot 0. The next addresses - * are the multiple MAC addresses. So multiple MAC address 0 - * is in slot 1, 1 in slot 2, and so on. So the first multiple - * MAC address resides in slot 1. 
- */ - for (slot = 1; slot < ixgbe->unicst_total; slot++) { - if (ixgbe->unicst_addr[slot].mac.set == 0) - break; - } - - ASSERT((slot > 0) && (slot < ixgbe->unicst_total)); - - maddr->mma_slot = slot; - - if ((err = ixgbe_unicst_set(ixgbe, maddr->mma_addr, slot)) == 0) { - ixgbe->unicst_addr[slot].mac.set = 1; - ixgbe->unicst_avail--; - } - - mutex_exit(&ixgbe->gen_lock); - - return (err); -} - -/* - * Removes a MAC address that was added before. - */ -int -ixgbe_m_unicst_remove(void *arg, mac_addr_slot_t slot) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - int err; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - if ((slot <= 0) || (slot >= ixgbe->unicst_total)) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - if (ixgbe->unicst_addr[slot].mac.set == 1) { - /* - * Copy the default address to the passed slot - */ - if ((err = ixgbe_unicst_set(ixgbe, - ixgbe->unicst_addr[0].mac.addr, slot)) == 0) { - ixgbe->unicst_addr[slot].mac.set = 0; - ixgbe->unicst_avail++; - } - - mutex_exit(&ixgbe->gen_lock); - - return (err); - } - - mutex_exit(&ixgbe->gen_lock); - - return (EINVAL); -} - -/* - * Modifies the value of an address that has been added before. - * The new address length and the slot number that was returned - * in the call to add should be passed in. mma_flags should be - * set to 0. - * Returns 0 on success. - */ -int -ixgbe_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mac_addr_slot_t slot; - int err; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - if (mac_unicst_verify(ixgbe->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - slot = maddr->mma_slot; - - if ((slot <= 0) || (slot >= ixgbe->unicst_total)) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - if (ixgbe->unicst_addr[slot].mac.set == 1) { - err = ixgbe_unicst_set(ixgbe, maddr->mma_addr, slot); - mutex_exit(&ixgbe->gen_lock); - return (err); - } - - mutex_exit(&ixgbe->gen_lock); - - return (EINVAL); -} - -/* - * Get the MAC address and all other information related to - * the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. - * On return, mma_flags can take the following values: - * 1) MMAC_SLOT_UNUSED - * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR - * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR - * 4) MMAC_SLOT_USED - */ -int -ixgbe_m_unicst_get(void *arg, mac_multi_addr_t *maddr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mac_addr_slot_t slot; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - slot = maddr->mma_slot; - - if ((slot <= 0) || (slot >= ixgbe->unicst_total)) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - if (ixgbe->unicst_addr[slot].mac.set == 1) { - bcopy(ixgbe->unicst_addr[slot].mac.addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_flags = MMAC_SLOT_USED; - } else { - maddr->mma_flags = MMAC_SLOT_UNUSED; - } - - mutex_exit(&ixgbe->gen_lock); - - return (0); -} - /* * Obtain the MAC's capabilities and associated data from * the driver. 
@@ -732,25 +524,29 @@ ixgbe_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); } } - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; - - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. - */ - mmacp->maddr_naddr = ixgbe->unicst_total - 1; - mmacp->maddr_naddrfree = ixgbe->unicst_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = ixgbe; - mmacp->maddr_add = ixgbe_m_unicst_add; - mmacp->maddr_remove = ixgbe_m_unicst_remove; - mmacp->maddr_modify = ixgbe_m_unicst_modify; - mmacp->maddr_get = ixgbe_m_unicst_get; - mmacp->maddr_reserve = NULL; + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_RX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = ixgbe->num_rx_rings; + cap_rings->mr_gnum = ixgbe->num_rx_groups; + cap_rings->mr_rget = ixgbe_fill_ring; + cap_rings->mr_gget = ixgbe_fill_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + break; + case MAC_RING_TYPE_TX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = ixgbe->num_tx_rings; + cap_rings->mr_gnum = 0; + cap_rings->mr_rget = ixgbe_fill_ring; + cap_rings->mr_gget = NULL; + break; + default: + break; + } break; } default: diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index f7bbcb1ff6..f8acd5fdd5 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -61,6 +61,8 @@ static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *); static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *); static void ixgbe_setup_rss(ixgbe_t *); static void ixgbe_init_unicst(ixgbe_t *); +static int ixgbe_unicst_set(ixgbe_t *, const uint8_t *, int); +static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *); static void ixgbe_setup_multicst(ixgbe_t *); static void ixgbe_get_hw_state(ixgbe_t *); static void ixgbe_get_conf(ixgbe_t *); @@ -83,7 +85,9 @@ static int ixgbe_alloc_intr_handles(ixgbe_t *, int); static int ixgbe_add_intr_handlers(ixgbe_t *); static void ixgbe_map_rxring_to_vector(ixgbe_t *, int, int); static void ixgbe_map_txring_to_vector(ixgbe_t *, int, int); -static void ixgbe_set_ivar(ixgbe_t *, uint16_t, uint8_t); +static void ixgbe_setup_ivar(ixgbe_t *, uint16_t, uint8_t); +static void ixgbe_enable_ivar(ixgbe_t *, uint16_t); +static void ixgbe_disable_ivar(ixgbe_t *, uint16_t); static int ixgbe_map_rings_to_vectors(ixgbe_t *); static void ixgbe_setup_adapter_vector(ixgbe_t *); static void ixgbe_rem_intr_handlers(ixgbe_t *); @@ -92,12 +96,14 @@ static int ixgbe_enable_intrs(ixgbe_t *); static int ixgbe_disable_intrs(ixgbe_t *); static uint_t ixgbe_intr_legacy(void *, void *); static uint_t ixgbe_intr_msi(void *, void *); -static uint_t ixgbe_intr_rx(void *, void *); -static uint_t ixgbe_intr_tx_other(void *, void *); +static uint_t ixgbe_intr_rx_tx(void *, void *); +static uint_t ixgbe_intr_other(void *, void *); static void ixgbe_intr_rx_work(ixgbe_rx_ring_t *); static void ixgbe_intr_tx_work(ixgbe_tx_ring_t *); static void ixgbe_intr_other_work(ixgbe_t *); static void ixgbe_get_driver_control(struct ixgbe_hw *); +static int ixgbe_addmac(void *, const uint8_t *); +static int ixgbe_remmac(void *, const uint8_t *); static void ixgbe_release_driver_control(struct ixgbe_hw *); static int 
ixgbe_attach(dev_info_t *, ddi_attach_cmd_t); @@ -188,8 +194,7 @@ static mac_callbacks_t ixgbe_m_callbacks = { ixgbe_m_stop, ixgbe_m_promisc, ixgbe_m_multicst, - ixgbe_m_unicst, - ixgbe_m_tx, + NULL, NULL, ixgbe_m_ioctl, ixgbe_m_getcapab @@ -675,6 +680,7 @@ ixgbe_register_mac(ixgbe_t *ixgbe) mac->m_min_sdu = 0; mac->m_max_sdu = ixgbe->default_mtu; mac->m_margin = VLAN_TAGSZ; + mac->m_v12n = MAC_VIRT_LEVEL1; status = mac_register(mac, &ixgbe->mac_hdl); @@ -765,6 +771,7 @@ static int ixgbe_init_driver_settings(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; + dev_info_t *devinfo = ixgbe->dip; ixgbe_rx_ring_t *rx_ring; ixgbe_tx_ring_t *tx_ring; uint32_t rx_size; @@ -779,6 +786,11 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe) } /* + * Get the system page size + */ + ixgbe->sys_page_size = ddi_ptob(devinfo, (ulong_t)1); + + /* * Set rx buffer size * * The IP header alignment room is counted in the calculation. @@ -1569,6 +1581,23 @@ ixgbe_alloc_rings(ixgbe_t *ixgbe) return (IXGBE_FAILURE); } + /* + * Allocate memory space for rx ring groups + */ + ixgbe->rx_groups = kmem_zalloc( + sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups, + KM_NOSLEEP); + + if (ixgbe->rx_groups == NULL) { + kmem_free(ixgbe->rx_rings, + sizeof (ixgbe_rx_ring_t) * ixgbe->num_rx_rings); + kmem_free(ixgbe->tx_rings, + sizeof (ixgbe_tx_ring_t) * ixgbe->num_tx_rings); + ixgbe->rx_rings = NULL; + ixgbe->tx_rings = NULL; + return (IXGBE_FAILURE); + } + return (IXGBE_SUCCESS); } @@ -1589,6 +1618,12 @@ ixgbe_free_rings(ixgbe_t *ixgbe) sizeof (ixgbe_tx_ring_t) * ixgbe->num_tx_rings); ixgbe->tx_rings = NULL; } + + if (ixgbe->rx_groups != NULL) { + kmem_free(ixgbe->rx_groups, + sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups); + ixgbe->rx_groups = NULL; + } } /* @@ -1693,7 +1728,9 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) { ixgbe_rx_ring_t *rx_ring; struct ixgbe_hw *hw = &ixgbe->hw; + ixgbe_rx_group_t *rx_group; uint32_t reg_val; + uint32_t ring_mapping; int i; /* @@ -1723,6 +1760,29 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) } /* + * Setup rx groups. + */ + for (i = 0; i < ixgbe->num_rx_groups; i++) { + rx_group = &ixgbe->rx_groups[i]; + rx_group->index = i; + rx_group->ixgbe = ixgbe; + } + + /* + * Setup the per-ring statistics mapping. + */ + ring_mapping = 0; + for (i = 0; i < ixgbe->num_rx_rings; i++) { + ring_mapping |= (i & 0xF) << (8 * (i & 0x3)); + if ((i & 0x3) == 0x3) { + IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i >> 2), ring_mapping); + ring_mapping = 0; + } + } + if ((i & 0x3) != 0x3) + IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i >> 2), ring_mapping); + + /* * The Max Frame Size in MHADD will be internally increased by four * bytes if the packet has a VLAN field, so includes MTU, ethernet * header and frame check sequence. @@ -1858,6 +1918,7 @@ ixgbe_setup_tx(ixgbe_t *ixgbe) struct ixgbe_hw *hw = &ixgbe->hw; ixgbe_tx_ring_t *tx_ring; uint32_t reg_val; + uint32_t ring_mapping; int i; for (i = 0; i < ixgbe->num_tx_rings; i++) { @@ -1866,6 +1927,20 @@ ixgbe_setup_tx(ixgbe_t *ixgbe) } /* + * Setup the per-ring statistics mapping. 
+ */ + ring_mapping = 0; + for (i = 0; i < ixgbe->num_tx_rings; i++) { + ring_mapping |= (i & 0xF) << (8 * (i & 0x3)); + if ((i & 0x3) == 0x3) { + IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i >> 2), ring_mapping); + ring_mapping = 0; + } + } + if ((i & 0x3) != 0x3) + IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i >> 2), ring_mapping); + + /* * Enable CRC appending and TX padding (for short tx frames) */ reg_val = IXGBE_READ_REG(hw, IXGBE_HLREG0); @@ -1936,13 +2011,13 @@ static void ixgbe_init_unicst(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; + uint8_t *mac_addr; int slot; /* * Here we should consider two situations: * - * 1. Chipset is initialized the first time - * Initialize the multiple unicast addresses, and - * save the default mac address. + * 1. Chipset is initialized at the first time, + * Clear all the multiple unicast addresses. * * 2. Chipset is reset * Recover the multiple unicast addresses from the @@ -1953,36 +2028,36 @@ ixgbe_init_unicst(ixgbe_t *ixgbe) * Initialize the multiple unicast addresses */ ixgbe->unicst_total = MAX_NUM_UNICAST_ADDRESSES; - - ixgbe->unicst_avail = ixgbe->unicst_total - 1; - - bcopy(hw->mac.addr, ixgbe->unicst_addr[0].mac.addr, - ETHERADDRL); - ixgbe->unicst_addr[0].mac.set = 1; - - for (slot = 1; slot < ixgbe->unicst_total; slot++) + ixgbe->unicst_avail = ixgbe->unicst_total; + for (slot = 0; slot < ixgbe->unicst_total; slot++) { + mac_addr = ixgbe->unicst_addr[slot].mac.addr; + bzero(mac_addr, ETHERADDRL); + (void) ixgbe_set_rar(hw, slot, mac_addr, NULL, NULL); ixgbe->unicst_addr[slot].mac.set = 0; - + } ixgbe->unicst_init = B_TRUE; } else { - /* - * Recover the default mac address - */ - bcopy(ixgbe->unicst_addr[0].mac.addr, hw->mac.addr, - ETHERADDRL); - /* Re-configure the RAR registers */ - for (slot = 1; slot < ixgbe->unicst_total; slot++) - (void) ixgbe_set_rar(hw, slot, - ixgbe->unicst_addr[slot].mac.addr, NULL, NULL); + for (slot = 0; slot < ixgbe->unicst_total; slot++) { + mac_addr = ixgbe->unicst_addr[slot].mac.addr; + if (ixgbe->unicst_addr[slot].mac.set == 1) { + (void) ixgbe_set_rar(hw, slot, mac_addr, + NULL, IXGBE_RAH_AV); + } else { + bzero(mac_addr, ETHERADDRL); + (void) ixgbe_set_rar(hw, slot, mac_addr, + NULL, NULL); + } + } } } + /* * ixgbe_unicst_set - Set the unicast address to the specified slot. */ int ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr, - mac_addr_slot_t slot) + int slot) { struct ixgbe_hw *hw = &ixgbe->hw; @@ -1996,7 +2071,7 @@ ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr, /* * Set the unicast address to the RAR register */ - (void) ixgbe_set_rar(hw, slot, (uint8_t *)mac_addr, NULL, NULL); + (void) ixgbe_set_rar(hw, slot, (uint8_t *)mac_addr, NULL, IXGBE_RAH_AV); if (ixgbe_check_acc_handle(ixgbe->osdep.reg_handle) != DDI_FM_OK) { ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED); @@ -2007,6 +2082,25 @@ ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr, } /* + * ixgbe_unicst_find - Find the slot for the specified unicast address + */ +int +ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(&ixgbe->gen_lock)); + + for (slot = 0; slot < ixgbe->unicst_total; slot++) { + if (bcmp(ixgbe->unicst_addr[slot].mac.addr, + mac_addr, ETHERADDRL) == 0) + return (slot); + } + + return (-1); +} + +/* * ixgbe_multicst_add - Add a multicst address. 
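/*
 * Illustrative sketch, not part of this changeset: the per-ring statistics
 * mapping loops added above pack the low four bits of each queue index into
 * successive byte lanes, four queues per 32-bit RQSMR/TQSMR register, with a
 * final flush when the queue count is not a multiple of four. The packing
 * arithmetic in isolation, runnable standalone:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t regs[4] = { 0 };	/* stand-ins for RQSMR(0..3) */
	int i;

	for (i = 0; i < 16; i++) {
		int reg = i >> 2;		/* four queues per register */
		int shift = 8 * (i & 0x3);	/* byte lane in the register */

		regs[reg] |= (uint32_t)(i & 0xF) << shift;
	}
	for (i = 0; i < 4; i++)
		printf("RQSMR(%d) = 0x%08x\n", i, regs[i]);
	return (0);
}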
*/ int @@ -2153,7 +2247,7 @@ ixgbe_get_conf(ixgbe_t *ixgbe) * Ethernet flow control configuration */ flow_control = ixgbe_get_prop(ixgbe, PROP_FLOW_CONTROL, - ixgbe_fc_none, 3, ixgbe_fc_full); + ixgbe_fc_none, 3, ixgbe_fc_none); if (flow_control == 3) flow_control = ixgbe_fc_default; @@ -2173,10 +2267,25 @@ ixgbe_get_conf(ixgbe_t *ixgbe) MIN_RX_RING_SIZE, MAX_RX_RING_SIZE, DEFAULT_RX_RING_SIZE); /* + * Multiple groups configuration + */ + ixgbe->num_rx_groups = ixgbe_get_prop(ixgbe, PROP_RX_GROUP_NUM, + MIN_RX_GROUP_NUM, MAX_RX_GROUP_NUM, DEFAULT_RX_GROUP_NUM); + + ixgbe->mr_enable = ixgbe_get_prop(ixgbe, PROP_MR_ENABLE, + 0, 1, DEFAULT_MR_ENABLE); + + if (ixgbe->mr_enable == B_FALSE) { + ixgbe->num_tx_rings = 1; + ixgbe->num_rx_rings = 1; + ixgbe->num_rx_groups = 1; + } + + /* * Tunable used to force an interrupt type. The only use is * for testing of the lesser interrupt types. * 0 = don't force interrupt type - * 1 = force interrupt type MSIX + * 1 = force interrupt type MSI-X * 2 = force interrupt type MSI * 3 = force interrupt type Legacy */ @@ -2413,6 +2522,7 @@ ixgbe_stall_check(ixgbe_t *ixgbe) result = B_FALSE; for (i = 0; i < ixgbe->num_tx_rings; i++) { tx_ring = &ixgbe->tx_rings[i]; + tx_ring->tx_recycle(tx_ring); if (tx_ring->recycle_fail > 0) tx_ring->stall_watchdog++; @@ -2872,11 +2982,12 @@ ixgbe_intr_rx_work(ixgbe_rx_ring_t *rx_ring) mutex_enter(&rx_ring->rx_lock); - mp = ixgbe_rx(rx_ring); + mp = ixgbe_ring_rx(rx_ring, IXGBE_POLL_NULL); mutex_exit(&rx_ring->rx_lock); if (mp != NULL) - mac_rx(rx_ring->ixgbe->mac_hdl, NULL, mp); + mac_rx_ring(rx_ring->ixgbe->mac_hdl, rx_ring->ring_handle, mp, + rx_ring->ring_gen_num); } #pragma inline(ixgbe_intr_tx_work) @@ -2897,7 +3008,8 @@ ixgbe_intr_tx_work(ixgbe_tx_ring_t *tx_ring) if (tx_ring->reschedule && (tx_ring->tbd_free >= tx_ring->resched_thresh)) { tx_ring->reschedule = B_FALSE; - mac_tx_update(tx_ring->ixgbe->mac_hdl); + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); IXGBE_DEBUG_STAT(tx_ring->stat_reschedule); } } @@ -2943,6 +3055,7 @@ ixgbe_intr_legacy(void *arg1, void *arg2) ixgbe_t *ixgbe = (ixgbe_t *)arg1; struct ixgbe_hw *hw = &ixgbe->hw; ixgbe_tx_ring_t *tx_ring; + ixgbe_rx_ring_t *rx_ring; uint32_t eicr; mblk_t *mp; boolean_t tx_reschedule; @@ -2974,16 +3087,20 @@ ixgbe_intr_legacy(void *arg1, void *arg2) ASSERT(ixgbe->num_tx_rings == 1); /* - * For legacy interrupt, we can't differentiate - * between tx and rx, so always clean both + * For legacy interrupt, rx rings[0] will use RTxQ[0]. */ - if (eicr & IXGBE_EICR_RTX_QUEUE) { - + if (eicr & 0x1) { /* * Clean the rx descriptors */ - mp = ixgbe_rx(&ixgbe->rx_rings[0]); + rx_ring = &ixgbe->rx_rings[0]; + mp = ixgbe_ring_rx(rx_ring, IXGBE_POLL_NULL); + } + /* + * For legacy interrupt, tx rings[0] will use RTxQ[1]. + */ + if (eicr & 0x2) { /* * Recycle the tx descriptors */ @@ -3020,11 +3137,12 @@ ixgbe_intr_legacy(void *arg1, void *arg2) * Do the following work outside of the gen_lock */ if (mp != NULL) - mac_rx(ixgbe->mac_hdl, NULL, mp); + mac_rx_ring(rx_ring->ixgbe->mac_hdl, rx_ring->ring_handle, mp, + rx_ring->ring_gen_num); if (tx_reschedule) { tx_ring->reschedule = B_FALSE; - mac_tx_update(ixgbe->mac_hdl); + mac_tx_ring_update(ixgbe->mac_hdl, tx_ring->ring_handle); IXGBE_DEBUG_STAT(tx_ring->stat_reschedule); } @@ -3055,11 +3173,16 @@ ixgbe_intr_msi(void *arg1, void *arg2) ASSERT(ixgbe->num_tx_rings == 1); /* - * For MSI interrupt, we can't differentiate - * between tx and rx, so always clean both. 
+ * For MSI interrupt, rx rings[0] will use RTxQ[0]. */ - if (eicr & IXGBE_EICR_RTX_QUEUE) { + if (eicr & 0x1) { ixgbe_intr_rx_work(&ixgbe->rx_rings[0]); + } + + /* + * For MSI interrupt, tx rings[0] will use RTxQ[1]. + */ + if (eicr & 0x2) { ixgbe_intr_tx_work(&ixgbe->tx_rings[0]); } @@ -3071,38 +3194,47 @@ ixgbe_intr_msi(void *arg1, void *arg2) } /* - * ixgbe_intr_rx - Interrupt handler for rx. + * ixgbe_intr_rx_tx - Interrupt handler for rx and tx. */ static uint_t -ixgbe_intr_rx(void *arg1, void *arg2) +ixgbe_intr_rx_tx(void *arg1, void *arg2) { _NOTE(ARGUNUSED(arg2)); - ixgbe_ring_vector_t *vect = (ixgbe_ring_vector_t *)arg1; - ixgbe_t *ixgbe = vect->ixgbe; - int r_idx; + ixgbe_ring_vector_t *vect = (ixgbe_ring_vector_t *)arg1; + ixgbe_t *ixgbe = vect->ixgbe; + int r_idx = 0; /* - * clean each rx ring that has its bit set in the map + * Clean each rx ring that has its bit set in the map */ r_idx = bt_getlowbit(vect->rx_map, 0, (ixgbe->num_rx_rings - 1)); - while (r_idx >= 0) { ixgbe_intr_rx_work(&ixgbe->rx_rings[r_idx]); r_idx = bt_getlowbit(vect->rx_map, (r_idx + 1), (ixgbe->num_rx_rings - 1)); } + /* + * Clean each tx ring that has its bit set in the map + */ + r_idx = bt_getlowbit(vect->tx_map, 0, (ixgbe->num_tx_rings - 1)); + while (r_idx >= 0) { + ixgbe_intr_tx_work(&ixgbe->tx_rings[r_idx]); + r_idx = bt_getlowbit(vect->tx_map, (r_idx + 1), + (ixgbe->num_tx_rings - 1)); + } + return (DDI_INTR_CLAIMED); } /* - * ixgbe_intr_tx_other - Interrupt handler for both tx and other. + * ixgbe_intr_other - Interrupt handler for other. * - * Always look for Tx cleanup work. Only look for other work if the right - * bits are set in the Interrupt Cause Register. + * Only look for other work if the right bits are set in the + * Interrupt Cause Register. */ static uint_t -ixgbe_intr_tx_other(void *arg1, void *arg2) +ixgbe_intr_other(void *arg1, void *arg2) { _NOTE(ARGUNUSED(arg2)); ixgbe_t *ixgbe = (ixgbe_t *)arg1; @@ -3112,14 +3244,8 @@ ixgbe_intr_tx_other(void *arg1, void *arg2) eicr = IXGBE_READ_REG(hw, IXGBE_EICR); /* - * Always look for Tx cleanup work. We don't have separate - * transmit vectors, so we have only one tx ring enabled. - */ - ASSERT(ixgbe->num_tx_rings == 1); - ixgbe_intr_tx_work(&ixgbe->tx_rings[0]); - - /* - * Check for "other" causes. + * Need check cause bits and only link change will + * be processed */ if (eicr & IXGBE_EICR_LSC) { ixgbe_intr_other_work(ixgbe); @@ -3174,12 +3300,13 @@ ixgbe_alloc_intrs(ixgbe_t *ixgbe) } /* - * MSI-X not used, force rings to 1 + * MSI-X not used, force rings and groups to 1 */ ixgbe->num_rx_rings = 1; + ixgbe->num_rx_groups = 1; ixgbe->num_tx_rings = 1; ixgbe_log(ixgbe, - "MSI-X not used, force rx and tx queue number to 1"); + "MSI-X not used, force rings and groups number to 1"); /* * Install MSI interrupts @@ -3217,30 +3344,19 @@ ixgbe_alloc_intrs(ixgbe_t *ixgbe) * * For legacy and MSI, only 1 handle is needed. For MSI-X, * if fewer than 2 handles are available, return failure. - * Upon success, this sets the number of Rx rings to a number that - * matches the handles available for Rx interrupts. + * Upon success, this maps the vectors to rx and tx rings for + * interrupts. */ static int ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) { dev_info_t *devinfo; int request, count, avail, actual; - int rx_rings, minimum; + int minimum; int rc; devinfo = ixgbe->dip; - /* - * Currently only 1 tx ring is supported. More tx rings - * will be supported with future enhancement. 
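/*
 * Illustrative sketch, not part of this changeset: the MSI-X sizing below
 * now requests one vector per rx ring, one per tx ring, plus one for
 * "other", clamped to the 82598 limit. The computation in isolation
 * (parameter names invented for the sketch):
 */
static int
msix_request(int num_rx_rings, int num_tx_rings, int max_ring_vector)
{
	int request = num_rx_rings + num_tx_rings + 1;

	/* never ask for more than the ring vectors plus the other vector */
	if (request > max_ring_vector + 1)
		request = max_ring_vector + 1;
	return (request);
}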
- */ - if (ixgbe->num_tx_rings > 1) { - ixgbe->num_tx_rings = 1; - ixgbe_log(ixgbe, - "Use only 1 MSI-X vector for tx, " - "force tx queue number to 1"); - } - switch (intr_type) { case DDI_INTR_TYPE_FIXED: request = 1; /* Request 1 legacy interrupt handle */ @@ -3257,11 +3373,11 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) case DDI_INTR_TYPE_MSIX: /* * Best number of vectors for the adapter is - * # rx rings + # tx rings + 1 for other - * But currently we only support number of vectors of - * # rx rings + 1 for tx & other + * # rx rings + # tx rings + 1 for other. */ - request = ixgbe->num_rx_rings + 1; + request = ixgbe->num_rx_rings + ixgbe->num_tx_rings + 1; + if (request > (IXGBE_MAX_RING_VECTOR + 1)) + request = IXGBE_MAX_RING_VECTOR + 1; minimum = 2; IXGBE_DEBUGLOG_0(ixgbe, "interrupt type: MSI-X"); break; @@ -3327,9 +3443,8 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) ixgbe->intr_cnt = actual; /* - * Now we know the actual number of vectors. Here we assume that - * tx and other will share 1 vector and all remaining (must be at - * least 1 remaining) will be used for rx. + * Now we know the actual number of vectors. Here we map the vector + * to other, rx rings and tx ring. */ if (actual < minimum) { ixgbe_log(ixgbe, "Insufficient interrupt handles available: %d", @@ -3338,19 +3453,6 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) } /* - * For MSI-X, actual might force us to reduce number of rx rings - */ - if (intr_type == DDI_INTR_TYPE_MSIX) { - rx_rings = actual - 1; - if (rx_rings < ixgbe->num_rx_rings) { - ixgbe_log(ixgbe, - "MSI-X vectors force Rx queue number to %d", - rx_rings); - ixgbe->num_rx_rings = rx_rings; - } - } - - /* * Get priority for first vector, assume remaining are all the same */ rc = ddi_intr_get_pri(ixgbe->htable[0], &ixgbe->intr_pri); @@ -3386,56 +3488,47 @@ alloc_handle_fail: static int ixgbe_add_intr_handlers(ixgbe_t *ixgbe) { - ixgbe_rx_ring_t *rx_ring; - int vector; + int vector = 0; int rc; - int i; - - vector = 0; switch (ixgbe->intr_type) { case DDI_INTR_TYPE_MSIX: /* - * Add interrupt handler for tx + other - */ - rc = ddi_intr_add_handler(ixgbe->htable[vector], - (ddi_intr_handler_t *)ixgbe_intr_tx_other, - (void *)ixgbe, NULL); - if (rc != DDI_SUCCESS) { - ixgbe_log(ixgbe, - "Add tx/other interrupt handler failed: %d", rc); - return (IXGBE_FAILURE); - } - vector++; - - /* - * Add interrupt handler for each rx ring + * Add interrupt handler for rx and tx rings: vector[0 - + * (ixgbe->intr_cnt -1)]. */ - for (i = 0; i < ixgbe->num_rx_rings; i++) { - rx_ring = &ixgbe->rx_rings[i]; - + for (vector = 0; vector < (ixgbe->intr_cnt -1); vector++) { /* * install pointer to vect_map[vector] */ rc = ddi_intr_add_handler(ixgbe->htable[vector], - (ddi_intr_handler_t *)ixgbe_intr_rx, + (ddi_intr_handler_t *)ixgbe_intr_rx_tx, (void *)&ixgbe->vect_map[vector], NULL); if (rc != DDI_SUCCESS) { ixgbe_log(ixgbe, "Add rx interrupt handler failed. 
" - "return: %d, rx ring: %d", rc, i); + "return: %d, vector: %d", rc, vector); for (vector--; vector >= 0; vector--) { (void) ddi_intr_remove_handler( ixgbe->htable[vector]); } return (IXGBE_FAILURE); } + } - rx_ring->intr_vector = vector; - - vector++; + /* + * Add interrupt handler for other: vector[ixgbe->intr_cnt -1] + */ + rc = ddi_intr_add_handler(ixgbe->htable[vector], + (ddi_intr_handler_t *)ixgbe_intr_other, + (void *)ixgbe, NULL); + if (rc != DDI_SUCCESS) { + ixgbe_log(ixgbe, + "Add other interrupt handler failed: %d", rc); + return (IXGBE_FAILURE); } + break; case DDI_INTR_TYPE_MSI: @@ -3452,10 +3545,6 @@ ixgbe_add_intr_handlers(ixgbe_t *ixgbe) return (IXGBE_FAILURE); } - rx_ring = &ixgbe->rx_rings[0]; - rx_ring->intr_vector = vector; - - vector++; break; case DDI_INTR_TYPE_FIXED: @@ -3472,17 +3561,13 @@ ixgbe_add_intr_handlers(ixgbe_t *ixgbe) return (IXGBE_FAILURE); } - rx_ring = &ixgbe->rx_rings[0]; - rx_ring->intr_vector = vector; - - vector++; break; default: return (IXGBE_FAILURE); } - ASSERT(vector == ixgbe->intr_cnt); + ASSERT(vector == (ixgbe->intr_cnt -1)); return (IXGBE_SUCCESS); } @@ -3509,6 +3594,7 @@ ixgbe_map_rxring_to_vector(ixgbe_t *ixgbe, int r_idx, int v_idx) /* * Remember bit position */ + ixgbe->rx_rings[r_idx].intr_vector = v_idx; ixgbe->rx_rings[r_idx].vect_bit = 1 << v_idx; } @@ -3534,48 +3620,81 @@ ixgbe_map_txring_to_vector(ixgbe_t *ixgbe, int t_idx, int v_idx) /* * Remember bit position */ + ixgbe->tx_rings[t_idx].intr_vector = v_idx; ixgbe->tx_rings[t_idx].vect_bit = 1 << v_idx; } /* - * ixgbe_set_ivar - Set the given entry in the given interrupt vector + * ixgbe_setup_ivar - Set the given entry in the given interrupt vector * allocation register (IVAR). */ static void -ixgbe_set_ivar(ixgbe_t *ixgbe, uint16_t int_alloc_entry, uint8_t msix_vector) +ixgbe_setup_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry, uint8_t msix_vector) { struct ixgbe_hw *hw = &ixgbe->hw; u32 ivar, index; msix_vector |= IXGBE_IVAR_ALLOC_VAL; - index = (int_alloc_entry >> 2) & 0x1F; + index = (intr_alloc_entry >> 2) & 0x1F; + ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); + ivar &= ~(0xFF << (8 * (intr_alloc_entry & 0x3))); + ivar |= (msix_vector << (8 * (intr_alloc_entry & 0x3))); + IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar); +} + +/* + * ixgbe_enable_ivar - Enable the given entry by setting the VAL bit of + * given interrupt vector allocation register (IVAR). + */ +static void +ixgbe_enable_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry) +{ + struct ixgbe_hw *hw = &ixgbe->hw; + u32 ivar, index; + + index = (intr_alloc_entry >> 2) & 0x1F; + ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); + ivar |= (IXGBE_IVAR_ALLOC_VAL << (8 * (intr_alloc_entry & 0x3))); + IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar); +} + +/* + * ixgbe_enable_ivar - Disble the given entry by clearing the VAL bit of + * given interrupt vector allocation register (IVAR). + */ +static void +ixgbe_disable_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry) +{ + struct ixgbe_hw *hw = &ixgbe->hw; + u32 ivar, index; + + index = (intr_alloc_entry >> 2) & 0x1F; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); - ivar &= ~(0xFF << (8 * (int_alloc_entry & 0x3))); - ivar |= (msix_vector << (8 * (int_alloc_entry & 0x3))); + ivar &= ~(IXGBE_IVAR_ALLOC_VAL << (8 * (intr_alloc_entry & 0x3))); IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar); } /* * ixgbe_map_rings_to_vectors - Map descriptor rings to interrupt vectors. 
* - * For msi-x, this currently implements only the scheme which is - * 1 vector for tx + other, 1 vector for each rx ring. + * For MSI-X, here will map rx and tx ring to vector[0 - (vectors -1)]. + * The last vector will be used for other interrupt. */ static int ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe) { int i, vector = 0; - int vect_remain = ixgbe->intr_cnt; /* initialize vector map */ bzero(&ixgbe->vect_map, sizeof (ixgbe->vect_map)); /* - * non-MSI-X case is very simple: all interrupts on vector 0 + * non-MSI-X case is very simple: rx rings[0] on RTxQ[0], + * tx rings[0] on RTxQ[1]. */ if (ixgbe->intr_type != DDI_INTR_TYPE_MSIX) { ixgbe_map_rxring_to_vector(ixgbe, 0, 0); - ixgbe_map_txring_to_vector(ixgbe, 0, 0); + ixgbe_map_txring_to_vector(ixgbe, 0, 1); return (IXGBE_SUCCESS); } @@ -3584,16 +3703,19 @@ ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe) */ /* - * Map vector 0 to tx + * Map vectors to rx rings */ - ixgbe_map_txring_to_vector(ixgbe, 0, vector++); - vect_remain--; + for (i = 0; i < ixgbe->num_rx_rings; i++) { + ixgbe_map_rxring_to_vector(ixgbe, i, vector); + vector = (vector +1) % (ixgbe->intr_cnt -1); + } /* - * Map remaining vectors to rx rings + * Map vectors to tx rings */ - for (i = 0; i < vect_remain; i++) { - ixgbe_map_rxring_to_vector(ixgbe, i, vector++); + for (i = 0; i < ixgbe->num_tx_rings; i++) { + ixgbe_map_txring_to_vector(ixgbe, i, vector); + vector = (vector +1) % (ixgbe->intr_cnt -1); } return (IXGBE_SUCCESS); @@ -3602,16 +3724,16 @@ ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe) /* * ixgbe_setup_adapter_vector - Setup the adapter interrupt vector(s). * - * This relies on queue/vector mapping already set up in the + * This relies on ring/vector mapping already set up in the * vect_map[] structures */ static void ixgbe_setup_adapter_vector(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; - ixgbe_ring_vector_t *vect; /* vector bitmap */ - int r_idx; /* ring index */ - int v_idx; /* vector index */ + ixgbe_ring_vector_t *vect; /* vector bitmap */ + int r_idx; /* ring index */ + int v_idx; /* vector index */ /* * Clear any previous entries @@ -3620,9 +3742,20 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe) IXGBE_WRITE_REG(hw, IXGBE_IVAR(v_idx), 0); /* - * "Other" is always on vector 0 + * For non MSI-X interrupt, rx rings[0] will use RTxQ[0], and + * tx rings[0] will use RTxQ[1]. + */ + if (ixgbe->intr_type != DDI_INTR_TYPE_MSIX) { + ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(0), 0); + ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(0), 1); + return; + } + + /* + * For MSI-X interrupt, "Other" is always on last vector. 
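/*
 * Illustrative sketch, not part of this changeset: the new mapping deals
 * rx rings and then tx rings round-robin over the vectors left after the
 * last one is reserved for "other". A standalone model with assumed example
 * counts, runnable as-is:
 */
#include <stdio.h>

int
main(void)
{
	int num_rx = 8, num_tx = 8, intr_cnt = 5;	/* assumed sizes */
	int ring_vectors = intr_cnt - 1;	/* last vector is "other" */
	int vector = 0, i;

	for (i = 0; i < num_rx; i++) {
		printf("rx ring %d -> vector %d\n", i, vector);
		vector = (vector + 1) % ring_vectors;
	}
	/* tx rings continue from wherever the rx assignment stopped */
	for (i = 0; i < num_tx; i++) {
		printf("tx ring %d -> vector %d\n", i, vector);
		vector = (vector + 1) % ring_vectors;
	}
	return (0);
}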
 	 */
-	ixgbe_set_ivar(ixgbe, IXGBE_IVAR_OTHER_CAUSES_INDEX, 0);
+	ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+	    (ixgbe->intr_cnt - 1));
 
 	/*
 	 * For each interrupt vector, populate the IVAR table
@@ -3637,7 +3770,7 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
 		    (ixgbe->num_rx_rings - 1));
 
 		while (r_idx >= 0) {
-			ixgbe_set_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx),
+			ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx),
 			    v_idx);
 			r_idx = bt_getlowbit(vect->rx_map, (r_idx + 1),
 			    (ixgbe->num_rx_rings - 1));
@@ -3650,7 +3783,7 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
 		    (ixgbe->num_tx_rings - 1));
 
 		while (r_idx >= 0) {
-			ixgbe_set_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(r_idx),
+			ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(r_idx),
 			    v_idx);
 			r_idx = bt_getlowbit(vect->tx_map, (r_idx + 1),
 			    (ixgbe->num_tx_rings - 1));
@@ -3996,3 +4129,231 @@ ixgbe_fm_ereport(ixgbe_t *ixgbe, char *detail)
 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, NULL);
 	}
 }
+
+static int
+ixgbe_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+	ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)rh;
+
+	mutex_enter(&rx_ring->rx_lock);
+	rx_ring->ring_gen_num = mr_gen_num;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ */
+/* ARGSUSED */
+void
+ixgbe_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+	ixgbe_t *ixgbe = (ixgbe_t *)arg;
+	mac_intr_t *mintr = &infop->mri_intr;
+
+	switch (rtype) {
+	case MAC_RING_TYPE_RX: {
+		ASSERT(rg_index == 0);
+		ASSERT(ring_index < ixgbe->num_rx_rings);
+
+		ixgbe_rx_ring_t *rx_ring = &ixgbe->rx_rings[ring_index];
+		rx_ring->ring_handle = rh;
+
+		infop->mri_driver = (mac_ring_driver_t)rx_ring;
+		infop->mri_start = ixgbe_ring_start;
+		infop->mri_stop = NULL;
+		infop->mri_poll = ixgbe_ring_rx_poll;
+
+		mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+		mintr->mi_enable = ixgbe_rx_ring_intr_enable;
+		mintr->mi_disable = ixgbe_rx_ring_intr_disable;
+
+		break;
+	}
+	case MAC_RING_TYPE_TX: {
+		ASSERT(rg_index == -1);
+		ASSERT(ring_index < ixgbe->num_tx_rings);
+
+		ixgbe_tx_ring_t *tx_ring = &ixgbe->tx_rings[ring_index];
+		tx_ring->ring_handle = rh;
+
+		infop->mri_driver = (mac_ring_driver_t)tx_ring;
+		infop->mri_start = NULL;
+		infop->mri_stop = NULL;
+		infop->mri_tx = ixgbe_ring_tx;
+
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+/*
+ * Callback function for the MAC layer to register all groups.
+ */
+void
+ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	ixgbe_t *ixgbe = (ixgbe_t *)arg;
+
+	switch (rtype) {
+	case MAC_RING_TYPE_RX: {
+		ixgbe_rx_group_t *rx_group;
+
+		rx_group = &ixgbe->rx_groups[index];
+		rx_group->group_handle = gh;
+
+		infop->mgi_driver = (mac_group_driver_t)rx_group;
+		infop->mgi_start = NULL;
+		infop->mgi_stop = NULL;
+		infop->mgi_addmac = ixgbe_addmac;
+		infop->mgi_remmac = ixgbe_remmac;
+		infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups);
+
+		break;
+	}
+	case MAC_RING_TYPE_TX:
+		break;
+	default:
+		break;
+	}
+}
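/*
 * Illustrative sketch, not part of this changeset: mri_start hands the
 * driver a generation number, stored above under rx_lock; every
 * mac_rx_ring() and poll delivery is stamped with it, presumably so the
 * framework can discard packets belonging to a ring that was quiesced and
 * restarted in between. A minimal model of that guard, with invented names:
 */
#include <stdint.h>

typedef struct ring_state {
	uint64_t gen;	/* generation from the latest ring start */
} ring_state_t;

/* Accept a delivery only if it was produced under the current generation. */
static int
delivery_current(const ring_state_t *rs, uint64_t stamped_gen)
{
	return (stamped_gen == rs->gen);
}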
+
+/*
+ * Enable interrupt on the specified rx ring.
+ */
+int
+ixgbe_rx_ring_intr_enable(mac_intr_handle_t intrh)
+{
+	ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)intrh;
+	ixgbe_t *ixgbe = rx_ring->ixgbe;
+	int r_idx = rx_ring->index;
+	int v_idx = rx_ring->intr_vector;
+
+	mutex_enter(&ixgbe->gen_lock);
+	ASSERT(BT_TEST(ixgbe->vect_map[v_idx].rx_map, r_idx) == 0);
+
+	/*
+	 * Enable the interrupt by setting the VAL bit of the given
+	 * interrupt vector allocation register (IVAR).
+	 */
+	ixgbe_enable_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx));
+
+	BT_SET(ixgbe->vect_map[v_idx].rx_map, r_idx);
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (0);
+}
+
+/*
+ * Disable interrupt on the specified rx ring.
+ */
+int
+ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh)
+{
+	ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)intrh;
+	ixgbe_t *ixgbe = rx_ring->ixgbe;
+	int r_idx = rx_ring->index;
+	int v_idx = rx_ring->intr_vector;
+
+	mutex_enter(&ixgbe->gen_lock);
+
+	ASSERT(BT_TEST(ixgbe->vect_map[v_idx].rx_map, r_idx) == 1);
+
+	/*
+	 * Disable the interrupt by clearing the VAL bit of the given
+	 * interrupt vector allocation register (IVAR).
+	 */
+	ixgbe_disable_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx));
+
+	BT_CLEAR(ixgbe->vect_map[v_idx].rx_map, r_idx);
+
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (0);
+}
+
+/*
+ * Add a mac address.
+ */
+static int
+ixgbe_addmac(void *arg, const uint8_t *mac_addr)
+{
+	ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)arg;
+	ixgbe_t *ixgbe = rx_group->ixgbe;
+	int slot;
+	int err;
+
+	mutex_enter(&ixgbe->gen_lock);
+
+	if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (ECANCELED);
+	}
+
+	if (ixgbe->unicst_avail == 0) {
+		/* no slots available */
+		mutex_exit(&ixgbe->gen_lock);
+		return (ENOSPC);
+	}
+
+	for (slot = 0; slot < ixgbe->unicst_total; slot++) {
+		if (ixgbe->unicst_addr[slot].mac.set == 0)
+			break;
+	}
+
+	ASSERT((slot >= 0) && (slot < ixgbe->unicst_total));
+
+	if ((err = ixgbe_unicst_set(ixgbe, mac_addr, slot)) == 0) {
+		ixgbe->unicst_addr[slot].mac.set = 1;
+		ixgbe->unicst_avail--;
+	}
+
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (err);
+}
+
+/*
+ * Remove a mac address.
+ */
+static int
+ixgbe_remmac(void *arg, const uint8_t *mac_addr)
+{
+	ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)arg;
+	ixgbe_t *ixgbe = rx_group->ixgbe;
+	int slot;
+	int err;
+
+	mutex_enter(&ixgbe->gen_lock);
+
+	if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (ECANCELED);
+	}
+
+	slot = ixgbe_unicst_find(ixgbe, mac_addr);
+	if (slot == -1) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (EINVAL);
+	}
+
+	if (ixgbe->unicst_addr[slot].mac.set == 0) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (EINVAL);
+	}
+
+	bzero(ixgbe->unicst_addr[slot].mac.addr, ETHERADDRL);
+	if ((err = ixgbe_unicst_set(ixgbe,
+	    ixgbe->unicst_addr[slot].mac.addr, slot)) == 0) {
+		ixgbe->unicst_addr[slot].mac.set = 0;
+		ixgbe->unicst_avail++;
+	}
+
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (err);
+}
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_rx.c b/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
index 3f09a4215d..63e42cede2 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
@@ -1,19 +1,17 @@
 /*
  * CDDL HEADER START
  *
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "ixgbe_sw.h" @@ -176,7 +176,10 @@ ixgbe_rx_bind(ixgbe_rx_ring_t *rx_ring, uint32_t index, uint32_t pkt_len) * DMA buffer, we have to return and use bcopy to * process the packet. */ - if (current_rcb->mp == NULL) { + if (current_rcb->mp != NULL) { + current_rcb->mp->b_rptr += IPHDR_ALIGN_ROOM; + current_rcb->mp->b_wptr += IPHDR_ALIGN_ROOM; + } else { atomic_inc_32(&rx_ring->rcb_free); return (NULL); } @@ -246,7 +249,7 @@ ixgbe_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) } /* - * ixgbe_rx - Receive the data of one ring. + * ixgbe_ring_rx - Receive the data of one ring. * * This function goes throught h/w descriptor in one specified rx ring, * receives the data if the descriptor status shows the data is ready. @@ -254,7 +257,7 @@ ixgbe_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) * passed up to mac_rx(). 
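/*
 * Illustrative sketch, not part of this changeset: ixgbe_ring_rx() now
 * takes a byte budget; interrupt delivery passes IXGBE_POLL_NULL (-1,
 * meaning unlimited) while the poll path passes the caller's n_bytes. The
 * stop condition in isolation, with a local POLL_NULL stand-in:
 */
#include <stdint.h>

#define	POLL_NULL	(-1)

/* Return 1 if accepting a pkt_len-byte frame would exceed the budget. */
static int
budget_exceeded(int poll_bytes, uint32_t received, uint32_t pkt_len)
{
	/* poll_bytes is asserted non-negative on the poll path */
	return (poll_bytes != POLL_NULL &&
	    (received + pkt_len) > (uint32_t)poll_bytes);
}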
*/ mblk_t * -ixgbe_rx(ixgbe_rx_ring_t *rx_ring) +ixgbe_ring_rx(ixgbe_rx_ring_t *rx_ring, int poll_bytes) { union ixgbe_adv_rx_desc *current_rbd; rx_control_block_t *current_rcb; @@ -266,6 +269,7 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring) uint32_t pkt_len; uint32_t status_error; uint32_t pkt_num; + uint32_t received_bytes; ixgbe_t *ixgbe = rx_ring->ixgbe; struct ixgbe_hw *hw = &ixgbe->hw; @@ -289,6 +293,7 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring) rx_next = rx_ring->rbd_next; current_rbd = &rx_ring->rbd_ring[rx_next]; + received_bytes = 0; pkt_num = 0; status_error = current_rbd->wb.upper.status_error; while (status_error & IXGBE_RXD_STAT_DD) { @@ -309,6 +314,13 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring) (status_error & IXGBE_RXDADV_ERR_IPE)); pkt_len = current_rbd->wb.upper.length; + + if ((poll_bytes != IXGBE_POLL_NULL) && + ((received_bytes + pkt_len) > poll_bytes)) + break; + + received_bytes += pkt_len; + mp = NULL; /* * For packets with length more than the copy threshold, @@ -378,3 +390,21 @@ rx_discard: return (mblk_head); } + +mblk_t * +ixgbe_ring_rx_poll(void *arg, int n_bytes) +{ + ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)arg; + mblk_t *mp = NULL; + + ASSERT(n_bytes >= 0); + + if (n_bytes == 0) + return (mp); + + mutex_enter(&rx_ring->rx_lock); + mp = ixgbe_ring_rx(rx_ring, n_bytes); + mutex_exit(&rx_ring->rx_lock); + + return (mp); +} diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_stat.c b/usr/src/uts/common/io/ixgbe/ixgbe_stat.c index 776af1fba4..00eccf23a2 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_stat.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_stat.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "ixgbe_sw.h" @@ -87,17 +87,29 @@ ixgbe_update_stats(kstat_t *ks, int rw) ixgbe_ks->tx_reschedule.value.ui64 += ixgbe->tx_rings[i].stat_reschedule; } +#endif /* * Hardware calculated statistics. 
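/*
 * Illustrative sketch, not part of this changeset: the QPRC/QPTC/QBRC/QBTC
 * loop that follows keeps a running software total per queue (the +=
 * accumulation suggests these hardware counters clear on read) and derives
 * the global gprc/gptc/tor/tot figures by summing the per-queue
 * accumulators. The same shape, standalone:
 */
#include <stdint.h>

#define	NQ	16

/* qprc[] accumulates clear-on-read hardware samples; returns the total. */
static uint64_t
accumulate(uint64_t qprc[NQ], const uint32_t hw_read[NQ])
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < NQ; i++) {
		qprc[i] += hw_read[i];	/* add this interval's sample */
		total += qprc[i];	/* global counter = per-queue sum */
	}
	return (total);
}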
*/ + ixgbe_ks->gprc.value.ui64 = 0; + ixgbe_ks->gptc.value.ui64 = 0; + ixgbe_ks->tor.value.ui64 = 0; + ixgbe_ks->tot.value.ui64 = 0; for (i = 0; i < 16; i++) { - ixgbe_ks->gprc.value.ul += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); - ixgbe_ks->gptc.value.ul += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); - ixgbe_ks->tor.value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBRC(i)); - ixgbe_ks->tot.value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBTC(i)); + ixgbe_ks->qprc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QPRC(i)); + ixgbe_ks->gprc.value.ui64 += ixgbe_ks->qprc[i].value.ui64; + ixgbe_ks->qptc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QPTC(i)); + ixgbe_ks->gptc.value.ui64 += ixgbe_ks->qptc[i].value.ui64; + ixgbe_ks->qbrc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QBRC(i)); + ixgbe_ks->tor.value.ui64 += ixgbe_ks->qbrc[i].value.ui64; + ixgbe_ks->qbtc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QBTC(i)); + ixgbe_ks->tot.value.ui64 += ixgbe_ks->qbtc[i].value.ui64; } - /* * This is a Workaround: * Currently h/w GORCH, GOTCH, TORH registers are not @@ -124,7 +136,6 @@ ixgbe_update_stats(kstat_t *ks, int rw) ixgbe_ks->ptc511.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC511); ixgbe_ks->ptc1023.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC1023); ixgbe_ks->ptc1522.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC1522); -#endif ixgbe_ks->mspdc.value.ui64 += IXGBE_READ_REG(hw, IXGBE_MSPDC); for (i = 0; i < 8; i++) @@ -200,6 +211,7 @@ ixgbe_init_stats(ixgbe_t *ixgbe) KSTAT_DATA_UINT64); kstat_named_init(&ixgbe_ks->tx_reschedule, "tx_reschedule", KSTAT_DATA_UINT64); +#endif kstat_named_init(&ixgbe_ks->gprc, "good_pkts_recvd", KSTAT_DATA_UINT64); @@ -233,7 +245,138 @@ ixgbe_init_stats(ixgbe_t *ixgbe) KSTAT_DATA_UINT64); kstat_named_init(&ixgbe_ks->ptc1522, "pkts_xmitd_(1024-1522b)", KSTAT_DATA_UINT64); -#endif + + kstat_named_init(&ixgbe_ks->qprc[0], "queue_pkts_recvd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[1], "queue_pkts_recvd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[2], "queue_pkts_recvd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[3], "queue_pkts_recvd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[4], "queue_pkts_recvd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[5], "queue_pkts_recvd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[6], "queue_pkts_recvd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[7], "queue_pkts_recvd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[8], "queue_pkts_recvd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[9], "queue_pkts_recvd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[10], "queue_pkts_recvd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[11], "queue_pkts_recvd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[12], "queue_pkts_recvd [12]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[13], "queue_pkts_recvd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[14], "queue_pkts_recvd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[15], "queue_pkts_recvd [15]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ixgbe_ks->qptc[0], "queue_pkts_xmitd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[1], "queue_pkts_xmitd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[2], "queue_pkts_xmitd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[3], "queue_pkts_xmitd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[4], 
"queue_pkts_xmitd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[5], "queue_pkts_xmitd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[6], "queue_pkts_xmitd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[7], "queue_pkts_xmitd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[8], "queue_pkts_xmitd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[9], "queue_pkts_xmitd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[10], "queue_pkts_xmitd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[11], "queue_pkts_xmitd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[12], "queue_pkts_xmitd [12]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[13], "queue_pkts_xmitd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[14], "queue_pkts_xmitd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[15], "queue_pkts_xmitd [15]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ixgbe_ks->qbrc[0], "queue_bytes_recvd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[1], "queue_bytes_recvd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[2], "queue_bytes_recvd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[3], "queue_bytes_recvd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[4], "queue_bytes_recvd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[5], "queue_bytes_recvd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[6], "queue_bytes_recvd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[7], "queue_bytes_recvd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[8], "queue_bytes_recvd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[9], "queue_bytes_recvd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[10], "queue_bytes_recvd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[11], "queue_bytes_recvd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[12], "queue_bytes_recvd [12]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[13], "queue_bytes_recvd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[14], "queue_bytes_recvd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[15], "queue_bytes_recvd [15]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ixgbe_ks->qbtc[0], "queue_bytes_xmitd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[1], "queue_bytes_xmitd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[2], "queue_bytes_xmitd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[3], "queue_bytes_xmitd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[4], "queue_bytes_xmitd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[5], "queue_bytes_xmitd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[6], "queue_bytes_xmitd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[7], "queue_bytes_xmitd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[8], "queue_bytes_xmitd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[9], "queue_bytes_xmitd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[10], "queue_bytes_xmitd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[11], "queue_bytes_xmitd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[12], "queue_bytes_xmitd [12]", + KSTAT_DATA_UINT64); + 
kstat_named_init(&ixgbe_ks->qbtc[13], "queue_bytes_xmitd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[14], "queue_bytes_xmitd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[15], "queue_bytes_xmitd [15]", + KSTAT_DATA_UINT64); kstat_named_init(&ixgbe_ks->mspdc, "mac_short_packet_discard", KSTAT_DATA_UINT64); diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h index 390233fff5..f648c57a18 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h +++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,15 +20,17 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. 
*/ #ifndef _IXGBE_SW_H #define _IXGBE_SW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -48,7 +48,7 @@ extern "C" { #include <sys/modctl.h> #include <sys/errno.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/vlan.h> #include <sys/ddi.h> @@ -89,6 +89,8 @@ extern "C" { #define IXGBE_INTR_MSI 2 #define IXGBE_INTR_LEGACY 3 +#define IXGBE_POLL_NULL -1 + #define MAX_COOKIE 18 #define MIN_NUM_TX_DESC 2 @@ -102,6 +104,7 @@ extern "C" { */ #define MAX_TX_QUEUE_NUM 32 #define MAX_RX_QUEUE_NUM 64 +#define MAX_RX_GROUP_NUM 1 #define MAX_TX_RING_SIZE 4096 #define MAX_RX_RING_SIZE 4096 @@ -121,6 +124,7 @@ extern "C" { */ #define MIN_TX_QUEUE_NUM 1 #define MIN_RX_QUEUE_NUM 1 +#define MIN_RX_GROUP_NUM 1 #define MIN_TX_RING_SIZE 64 #define MIN_RX_RING_SIZE 64 @@ -136,17 +140,18 @@ extern "C" { /* * Default values for user configurable parameters */ -#define DEFAULT_TX_QUEUE_NUM 1 -#define DEFAULT_RX_QUEUE_NUM 1 -#define DEFAULT_TX_RING_SIZE 512 -#define DEFAULT_RX_RING_SIZE 512 +#define DEFAULT_TX_QUEUE_NUM 8 +#define DEFAULT_RX_QUEUE_NUM 8 +#define DEFAULT_RX_GROUP_NUM 1 +#define DEFAULT_TX_RING_SIZE 1024 +#define DEFAULT_RX_RING_SIZE 1024 #define DEFAULT_MTU ETHERMTU #define DEFAULT_RX_LIMIT_PER_INTR 256 #define DEFAULT_INTR_THROTTLING 200 /* In unit of 256 nsec */ #define DEFAULT_RX_COPY_THRESHOLD 128 #define DEFAULT_TX_COPY_THRESHOLD 512 -#define DEFAULT_TX_RECYCLE_THRESHOLD MAX_COOKIE +#define DEFAULT_TX_RECYCLE_THRESHOLD (MAX_COOKIE + 1) #define DEFAULT_TX_OVERLOAD_THRESHOLD MIN_NUM_TX_DESC #define DEFAULT_TX_RESCHED_THRESHOLD 128 #define DEFAULT_FCRTH 0x20000 @@ -156,6 +161,14 @@ extern "C" { #define DEFAULT_TX_HCKSUM_ENABLE B_TRUE #define DEFAULT_RX_HCKSUM_ENABLE B_TRUE #define DEFAULT_LSO_ENABLE B_TRUE +#define DEFAULT_MR_ENABLE B_TRUE +#define DEFAULT_TX_HEAD_WB_ENABLE B_TRUE + +#define IXGBE_LSO_MAXLEN 65535 + +#define DEFAULT_TX_HCKSUM_ENABLE B_TRUE +#define DEFAULT_RX_HCKSUM_ENABLE B_TRUE +#define DEFAULT_LSO_ENABLE B_TRUE #define DEFAULT_TX_HEAD_WB_ENABLE B_TRUE #define IXGBE_LSO_MAXLEN 65535 @@ -167,11 +180,12 @@ extern "C" { #define MAX_LINK_DOWN_TIMEOUT 8 /* 8 seconds */ /* - * limits on msi-x vectors for 82598 + * Limits on msi-x vectors for 82598 */ -#define IXGBE_MAX_INTR_VECTOR 18 -#define IXGBE_MAX_OTHER_VECTOR 2 -#define IXGBE_MAX_RING_VECTOR (IXGBE_MAX_INTR_VECTOR - IXGBE_MAX_OTHER_VECTOR) +#define IXGBE_MAX_INTR_VECTOR 18 +#define IXGBE_MAX_OTHER_VECTOR 1 +#define IXGBE_MAX_TCP_TIMER_VECTOR 1 +#define IXGBE_MAX_RING_VECTOR 16 /* * Extra register bit masks for 82598 @@ -209,11 +223,13 @@ extern "C" { #define PROP_TX_RING_SIZE "tx_ring_size" #define PROP_RX_QUEUE_NUM "rx_queue_number" #define PROP_RX_RING_SIZE "rx_ring_size" +#define PROP_RX_GROUP_NUM "rx_group_number" #define PROP_INTR_FORCE "intr_force" #define PROP_TX_HCKSUM_ENABLE "tx_hcksum_enable" #define PROP_RX_HCKSUM_ENABLE "rx_hcksum_enable" #define PROP_LSO_ENABLE "lso_enable" +#define PROP_MR_ENABLE "mr_enable" #define PROP_TX_HEAD_WB_ENABLE "tx_head_wb_enable" #define PROP_TX_COPY_THRESHOLD "tx_copy_threshold" #define PROP_TX_RECYCLE_THRESHOLD "tx_recycle_threshold" @@ -264,9 +280,6 @@ enum ioc_reply { IOC_REPLY /* OK, just send reply */ }; -#define MBLK_LEN(mp) ((uintptr_t)(mp)->b_wptr - \ - (uintptr_t)(mp)->b_rptr) - #define DMA_SYNC(area, flag) ((void) ddi_dma_sync((area)->dma_handle, \ 0, 0, (flag))) @@ -533,13 +546,15 @@ typedef struct ixgbe_tx_ring { uint32_t stat_fail_no_tcb; uint32_t stat_fail_dma_bind; 
uint32_t stat_reschedule; + uint32_t stat_lso_header_fail; #endif + mac_ring_handle_t ring_handle; + /* * Pointer to the ixgbe struct */ struct ixgbe *ixgbe; - } ixgbe_tx_ring_t; /* @@ -590,11 +605,22 @@ typedef struct ixgbe_rx_ring { uint32_t stat_exceed_pkt; #endif - struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ + mac_ring_handle_t ring_handle; + uint64_t ring_gen_num; + struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ } ixgbe_rx_ring_t; /* + * Software Receive Ring Group + */ +typedef struct ixgbe_rx_group { + uint32_t index; /* Group index */ + mac_group_handle_t group_handle; /* call back group handle */ + struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ +} ixgbe_rx_group_t; + +/* * structure to map ring cleanup to msi-x vector */ typedef struct ixgbe_ring_vector { @@ -641,6 +667,12 @@ typedef struct ixgbe { uint32_t rx_buf_size; /* Rx buffer size */ /* + * Receive Groups + */ + ixgbe_rx_group_t *rx_groups; /* Array of rx groups */ + uint32_t num_rx_groups; /* Number of rx groups in use */ + + /* * Transmit Rings */ ixgbe_tx_ring_t *tx_rings; /* Array of tx rings */ @@ -651,6 +683,7 @@ typedef struct ixgbe { boolean_t tx_head_wb_enable; /* Tx head wrtie-back */ boolean_t tx_hcksum_enable; /* Tx h/w cksum offload */ boolean_t lso_enable; /* Large Segment Offload */ + boolean_t mr_enable; /* Multiple Tx and Rx Ring */ uint32_t tx_copy_thresh; /* Tx copy threshold */ uint32_t tx_recycle_thresh; /* Tx recycle threshold */ uint32_t tx_overload_thresh; /* Tx overload threshold */ @@ -684,6 +717,8 @@ typedef struct ixgbe { uint32_t mcast_count; struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES]; + ulong_t sys_page_size; + /* * Kstat definitions */ @@ -694,13 +729,11 @@ typedef struct ixgbe { */ caddr_t nd_data; nd_param_t nd_params[PARAM_COUNT]; - } ixgbe_t; typedef struct ixgbe_stat { - kstat_named_t link_speed; /* Link Speed */ -#ifdef IXGBE_DEBUG + kstat_named_t reset_count; /* Reset Count */ kstat_named_t rx_frame_error; /* Rx Error in Packet */ @@ -729,7 +762,11 @@ typedef struct ixgbe_stat { kstat_named_t ptc511; /* Packets Xmitted (255-511b) */ kstat_named_t ptc1023; /* Packets Xmitted (512-1023b) */ kstat_named_t ptc1522; /* Packets Xmitted (1024-1522b */ -#endif + kstat_named_t qprc[16]; /* Queue Packets Received Count */ + kstat_named_t qptc[16]; /* Queue Packets Transmitted Count */ + kstat_named_t qbrc[16]; /* Queue Bytes Received Count */ + kstat_named_t qbtc[16]; /* Queue Bytes Transmitted Count */ + kstat_named_t crcerrs; /* CRC Error Count */ kstat_named_t illerrc; /* Illegal Byte Error Count */ kstat_named_t errbc; /* Error Byte Count */ @@ -770,7 +807,6 @@ void ixgbe_set_fma_flags(int, int); int ixgbe_start(ixgbe_t *); void ixgbe_stop(ixgbe_t *); int ixgbe_driver_setup_link(ixgbe_t *, boolean_t); -int ixgbe_unicst_set(ixgbe_t *, const uint8_t *, mac_addr_slot_t); int ixgbe_multicst_add(ixgbe_t *, const uint8_t *); int ixgbe_multicst_remove(ixgbe_t *, const uint8_t *); enum ioc_reply ixgbe_loopback_ioctl(ixgbe_t *, struct iocblk *, mblk_t *); @@ -783,6 +819,13 @@ int ixgbe_check_acc_handle(ddi_acc_handle_t handle); int ixgbe_check_dma_handle(ddi_dma_handle_t handle); void ixgbe_fm_ereport(ixgbe_t *, char *); +void ixgbe_fill_ring(void *, mac_ring_type_t, const int, const int, + mac_ring_info_t *, mac_ring_handle_t); +void ixgbe_fill_group(void *arg, mac_ring_type_t, const int, + mac_group_info_t *, mac_group_handle_t); +int ixgbe_rx_ring_intr_enable(mac_intr_handle_t); +int ixgbe_rx_ring_intr_disable(mac_intr_handle_t); + /* * Function prototypes in 
ixgbe_gld.c */ @@ -790,26 +833,22 @@ int ixgbe_m_start(void *); void ixgbe_m_stop(void *); int ixgbe_m_promisc(void *, boolean_t); int ixgbe_m_multicst(void *, boolean_t, const uint8_t *); -int ixgbe_m_unicst(void *, const uint8_t *); int ixgbe_m_stat(void *, uint_t, uint64_t *); void ixgbe_m_resources(void *); void ixgbe_m_ioctl(void *, queue_t *, mblk_t *); -int ixgbe_m_unicst_add(void *, mac_multi_addr_t *); -int ixgbe_m_unicst_remove(void *, mac_addr_slot_t); -int ixgbe_m_unicst_modify(void *, mac_multi_addr_t *); -int ixgbe_m_unicst_get(void *, mac_multi_addr_t *); boolean_t ixgbe_m_getcapab(void *, mac_capab_t, void *); /* * Function prototypes in ixgbe_rx.c */ -mblk_t *ixgbe_rx(ixgbe_rx_ring_t *); +mblk_t *ixgbe_ring_rx(ixgbe_rx_ring_t *, int); void ixgbe_rx_recycle(caddr_t arg); +mblk_t *ixgbe_ring_rx_poll(void *, int); /* * Function prototypes in ixgbe_tx.c */ -mblk_t *ixgbe_m_tx(void *, mblk_t *); +mblk_t *ixgbe_ring_tx(void *, mblk_t *); void ixgbe_free_tcb(tx_control_block_t *); void ixgbe_put_free_list(ixgbe_tx_ring_t *, link_list_t *); uint32_t ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *); @@ -834,7 +873,6 @@ enum ioc_reply ixgbe_nd_ioctl(ixgbe_t *, queue_t *, mblk_t *, struct iocblk *); */ int ixgbe_init_stats(ixgbe_t *); - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c index f2a5d8fa0c..721353c756 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,15 +20,16 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ #include "ixgbe_sw.h" -static boolean_t ixgbe_tx(ixgbe_tx_ring_t *, mblk_t *); static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *, uint32_t, boolean_t); static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *, @@ -44,7 +43,7 @@ static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *); static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *, ixgbe_tx_context_t *); static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *, - ixgbe_tx_context_t *); + ixgbe_tx_context_t *, int); #ifndef IXGBE_DEBUG #pragma inline(ixgbe_save_desc) @@ -54,65 +53,9 @@ static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *, #endif /* - * ixgbe_m_tx - * - * The GLDv3 interface to call driver's tx routine to transmit - * the mblks. - */ -mblk_t * -ixgbe_m_tx(void *arg, mblk_t *mp) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mblk_t *next; - ixgbe_tx_ring_t *tx_ring; - - /* - * If the adapter is suspended, or it is not started, or the link - * is not up, the mblks are simply dropped. - */ - if (((ixgbe->ixgbe_state & IXGBE_SUSPENDED) != 0) || - ((ixgbe->ixgbe_state & IXGBE_STARTED) == 0) || - (ixgbe->link_state != LINK_STATE_UP)) { - /* Free the mblk chain */ - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - freemsg(mp); - mp = next; - } - - return (NULL); - } - - /* - * Decide which tx ring is used to transmit the packets. - * This needs to be updated later to fit the new interface - * of the multiple rings support. - */ - tx_ring = &ixgbe->tx_rings[0]; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - if (!ixgbe_tx(tx_ring, mp)) { - mp->b_next = next; - break; - } - - mp = next; - } - - return (mp); -} - -/* - * ixgbe_tx - Main transmit processing + * ixgbe_ring_tx * - * Called from ixgbe_m_tx with an mblk ready to transmit. this - * routine sets up the transmit descriptors and sends data to - * the wire. + * To transmit one mblk through one specified ring. * * One mblk can consist of several fragments, each fragment * will be processed with different methods based on the size. @@ -136,9 +79,10 @@ ixgbe_m_tx(void *arg, mblk_t *mp) * be used. After the processing, those tx control blocks will * be put to the work list. */ -static boolean_t -ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) +mblk_t * +ixgbe_ring_tx(void *arg, mblk_t *mp) { + ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg; ixgbe_t *ixgbe = tx_ring->ixgbe; tx_type_t current_flag, next_flag; uint32_t current_len, next_len; @@ -150,11 +94,19 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) tx_control_block_t *tcb; ixgbe_tx_context_t tx_context, *ctx; link_list_t pending_list; + uint32_t len, hdr_frag_len, hdr_len; + uint32_t copy_thresh; + mblk_t *new_mp; + mblk_t *pre_mp; + + ASSERT(mp->b_next == NULL); + + copy_thresh = tx_ring->copy_thresh; /* Get the mblk size */ mbsize = 0; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mbsize += MBLK_LEN(nmp); + mbsize += MBLKL(nmp); } if (ixgbe->tx_hcksum_enable) { @@ -166,25 +118,24 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) ctx = &tx_context; if (ixgbe_get_context(mp, ctx) < 0) { freemsg(mp); - return (B_TRUE); + return (NULL); } /* * If the mblk size exceeds the max size ixgbe could - * process, then discard this mblk, and return B_TRUE + * process, then discard this mblk, and return NULL. 
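/*
 * Illustrative sketch, not part of this changeset: ixgbe_ring_tx() follows
 * the per-ring transmit convention visible in the surrounding hunks; it
 * returns NULL when the mblk was consumed (sent or deliberately dropped)
 * and returns the mblk itself when descriptors ran short, so the caller can
 * requeue it and retry after mac_tx_ring_update(). A hypothetical caller,
 * with a simplified mblk type:
 */
#include <stddef.h>

typedef struct mblk {
	struct mblk *b_next;	/* chain linkage, as in the DDI mblk */
} mblk_t;

typedef mblk_t *(*ring_tx_t)(void *ring, mblk_t *mp);

/* Send queued packets until the ring pushes back. */
static void
drain(ring_tx_t tx, void *ring, mblk_t **head)
{
	mblk_t *mp;

	while ((mp = *head) != NULL) {
		*head = mp->b_next;
		mp->b_next = NULL;
		if ((mp = tx(ring, mp)) != NULL) {
			mp->b_next = *head;	/* put it back and stop */
			*head = mp;
			break;
		}
	}
}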
*/ if ((ctx->lso_flag && ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) || (!ctx->lso_flag && (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) { freemsg(mp); IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize"); - return (B_TRUE); + return (NULL); } } else { ctx = NULL; } - /* * Check and recycle tx descriptors. * The recycle threshold here should be selected carefully @@ -194,13 +145,13 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) /* * After the recycling, if the tbd_free is less than the - * overload_threshold, assert overload, return B_FALSE; + * overload_threshold, assert overload, return mp; * and we need to re-schedule the tx again. */ if (tx_ring->tbd_free < tx_ring->overload_thresh) { tx_ring->reschedule = B_TRUE; IXGBE_DEBUG_STAT(tx_ring->stat_overload); - return (B_FALSE); + return (mp); } /* @@ -213,12 +164,77 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) desc_num = 0; desc_total = 0; + /* + * The software should guarantee LSO packet header(MAC+IP+TCP) + * to be within one descriptor. Here we reallocate and refill the + * the header if it's physical memory non-contiguous. + */ + if ((ctx != NULL) && ctx->lso_flag) { + /* find the last fragment of the header */ + len = MBLKL(mp); + ASSERT(len > 0); + nmp = mp; + pre_mp = NULL; + hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len; + while (len < hdr_len) { + pre_mp = nmp; + nmp = nmp->b_cont; + len += MBLKL(nmp); + } + /* + * If the header and the payload are in different mblks, + * we simply force the header to be copied into pre-allocated + * page-aligned buffer. + */ + if (len == hdr_len) + goto adjust_threshold; + + hdr_frag_len = hdr_len - (len - MBLKL(nmp)); + /* + * There are two cases we need to reallocate a mblk for the + * last header fragment: + * 1. the header is in multiple mblks and the last fragment + * share the same mblk with the payload + * 2. the header is in a single mblk shared with the payload + * and the header is physical memory non-contiguous + */ + if ((nmp != mp) || + (P2NPHASE((uintptr_t)nmp->b_rptr, ixgbe->sys_page_size) + < len)) { + IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail); + /* + * reallocate the mblk for the last header fragment, + * expect to bcopy into pre-allocated page-aligned + * buffer + */ + new_mp = allocb(hdr_frag_len, NULL); + if (!new_mp) + return (B_FALSE); + bcopy(nmp->b_rptr, new_mp->b_rptr, hdr_frag_len); + /* link the new header fragment with the other parts */ + new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len; + new_mp->b_cont = nmp; + if (pre_mp) + pre_mp->b_cont = new_mp; + nmp->b_rptr += hdr_frag_len; + if (hdr_frag_len == hdr_len) + mp = new_mp; + } +adjust_threshold: + /* + * adjust the bcopy threshhold to guarantee + * the header to use bcopy way + */ + if (copy_thresh < hdr_len) + copy_thresh = hdr_len; + } + current_mp = mp; - current_len = MBLK_LEN(current_mp); + current_len = MBLKL(current_mp); /* * Decide which method to use for the first fragment */ - current_flag = (current_len <= tx_ring->copy_thresh) ? + current_flag = (current_len <= copy_thresh) ? USE_COPY : USE_DMA; /* * If the mblk includes several contiguous small fragments, @@ -238,7 +254,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) while (current_mp) { next_mp = current_mp->b_cont; eop = (next_mp == NULL); /* Last fragment of the packet? */ - next_len = eop ? 0: MBLK_LEN(next_mp); + next_len = eop ? 
 /*
@@ -536,7 +552,9 @@ static int
 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 {
	uint32_t start;
-	uint32_t flags;
+	uint32_t hckflags;
+	uint32_t lsoflags;
+	uint32_t mss;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
@@ -548,16 +566,16 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)

	ASSERT(mp != NULL);

-	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);
+	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &hckflags);
	bzero(ctx, sizeof (ixgbe_tx_context_t));

-	ctx->hcksum_flags = flags;
-	if (flags == 0)
+	if (hckflags == 0)
		return (0);
+	ctx->hcksum_flags = hckflags;

-	ctx->mss = DB_LSOMSS(mp);
-	ctx->lso_flag = (ctx->hcksum_flags & HW_LSO) &&
-	    (ctx->mss != 0);
+	lso_info_get(mp, &mss, &lsoflags);
+	ctx->mss = mss;
+	ctx->lso_flag = (lsoflags == HW_LSO);

	/*
	 * LSO relies on tx h/w checksum, so we drop the packet here
@@ -582,12 +600,12 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 */
-	size = len = MBLK_LEN(mp);
+	size = len = MBLKL(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
-		len = MBLK_LEN(mp);
+		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;
@@ -601,7 +619,7 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
-		len = MBLK_LEN(mp);
+		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;
@@ -613,25 +631,32 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
	}

	/*
-	 * Here we assume the IP(V6) header is fully included in
+	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
-		offset = mac_hdr_len;
-		while (size <= offset) {
-			mp = mp->b_cont;
-			ASSERT(mp != NULL);
-			len = MBLK_LEN(mp);
-			size += len;
-		}
-		pos = mp->b_rptr + offset + len - size;
-
-		if (ctx->lso_flag) {
-			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
-			    ipha_length))) = 0;
-			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
-			    ipha_hdr_checksum))) = 0;
+		if (ctx->lso_flag) {
+			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
+			while (size <= offset) {
+				mp = mp->b_cont;
+				ASSERT(mp != NULL);
+				len = MBLKL(mp);
+				size += len;
+			}
+			pos = mp->b_rptr + offset + len - size;
+			*((uint16_t *)(uintptr_t)(pos)) = 0;
+
+			offset = offsetof(ipha_t, ipha_hdr_checksum) +
+			    mac_hdr_len;
+			while (size <= offset) {
+				mp = mp->b_cont;
+				ASSERT(mp != NULL);
+				len = MBLKL(mp);
+				size += len;
+			}
+			pos = mp->b_rptr + offset + len - size;
+			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/*
			 * To perform ixgbe LSO, here also need to fill
@@ -642,14 +667,23 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
			 */
		}

-		l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
+		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
+		while (size <= offset) {
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			len = MBLKL(mp);
+			size += len;
+		}
+		pos = mp->b_rptr + offset + len - size;
+
+		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
-			len = MBLK_LEN(mp);
+			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;
@@ -667,7 +701,7 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
-			len = MBLK_LEN(mp);
+			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;
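ixgbe_get_context() repeats the same idiom for every header field it touches: walk the b_cont chain until the cumulative size covers the field's offset, then compute a pointer into the fragment that holds it. Here is the pattern as a standalone helper sketch (the name msg_byte_at is hypothetical; like the driver code, it assumes the field itself does not straddle two fragments):

    /*
     * Return a pointer to the byte at 'offset' from the start of the
     * message, walking b_cont fragments as needed. The offset must lie
     * within the message.
     */
    static uint8_t *
    msg_byte_at(mblk_t *mp, uint32_t offset)
    {
        uint32_t len = MBLKL(mp);
        uint32_t size = len;

        while (size <= offset) {
            mp = mp->b_cont;
            ASSERT(mp != NULL);
            len = MBLKL(mp);
            size += len;
        }
        return (mp->b_rptr + offset + len - size);
    }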
@@ -702,13 +736,14 @@ ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
		return (B_FALSE);

	/*
-	 * Compare the checksum data retrieved from the mblk and the
-	 * stored checksum data of the last context descriptor. The data
-	 * need to be checked are:
+	 * Compare the context data retrieved from the mblk with the
+	 * stored data of the last context descriptor. The data to be
+	 * checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	mac_hdr_len
	 *	ip_hdr_len
+	 *	lso_flag
	 *	mss (only checked for LSO)
	 *	l4_hr_len (only checked for LSO)
	 * If any one of the above changes, a new context descriptor
@@ -716,16 +751,14 @@
	 */
	last = &tx_ring->tx_context;

-	if (ctx->hcksum_flags != 0) {
-		if ((ctx->hcksum_flags != last->hcksum_flags) ||
-		    (ctx->l4_proto != last->l4_proto) ||
-		    (ctx->mac_hdr_len != last->mac_hdr_len) ||
-		    (ctx->ip_hdr_len != last->ip_hdr_len) ||
-		    (ctx->lso_flag && ((ctx->mss != last->mss) ||
-		    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
-
-			return (B_TRUE);
-		}
+	if ((ctx->hcksum_flags != last->hcksum_flags) ||
+	    (ctx->l4_proto != last->l4_proto) ||
+	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
+	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
+	    (ctx->lso_flag != last->lso_flag) ||
+	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
+	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
+		return (B_TRUE);
	}

	return (B_FALSE);
@@ -738,11 +771,11 @@
 */
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
-    ixgbe_tx_context_t *ctx)
+    ixgbe_tx_context_t *ctx, int ring_index)
{
	/*
	 * Fill the context descriptor with the checksum
-	 * context information we've got
+	 * context information we've got.
	 */
	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
@@ -775,12 +808,12 @@ ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
	}

	ctx_tbd->seqnum_seed = 0;
+	ctx_tbd->mss_l4len_idx = ring_index << 4;
+
	if (ctx->lso_flag) {
-		ctx_tbd->mss_l4len_idx =
+		ctx_tbd->mss_l4len_idx |=
		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
-	} else {
-		ctx_tbd->mss_l4len_idx = 0;
	}
}

@@ -838,7 +871,7 @@ ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
			 */
			ixgbe_fill_context(
			    (struct ixgbe_adv_tx_context_desc *)tbd,
-			    ctx);
+			    ctx, tx_ring->index);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
@@ -908,6 +941,14 @@ ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
+	first_tbd->read.olinfo_status |= (tx_ring->index << 4);
+
+	if (ctx != NULL && ctx->lso_flag) {
+		first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
+		first_tbd->read.olinfo_status |=
+		    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
+		    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
+	}

	if (ctx != NULL && ctx->lso_flag) {
		first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
@@ -1017,14 +1058,18 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
*/ - if (mutex_tryenter(&tx_ring->recycle_lock) == 0) - return (0); + mutex_enter(&tx_ring->recycle_lock); ASSERT(tx_ring->tbd_free <= tx_ring->ring_size); if (tx_ring->tbd_free == tx_ring->ring_size) { tx_ring->recycle_fail = 0; tx_ring->stall_watchdog = 0; + if (tx_ring->reschedule) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); return (0); } @@ -1108,6 +1153,12 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring) */ atomic_add_32(&tx_ring->tbd_free, desc_num); + if ((tx_ring->tbd_free >= tx_ring->resched_thresh) && + (tx_ring->reschedule)) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); /* @@ -1152,14 +1203,18 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring) * The mutex_tryenter() is used to avoid unnecessary * lock contention. */ - if (mutex_tryenter(&tx_ring->recycle_lock) == 0) - return (0); + mutex_enter(&tx_ring->recycle_lock); ASSERT(tx_ring->tbd_free <= tx_ring->ring_size); if (tx_ring->tbd_free == tx_ring->ring_size) { tx_ring->recycle_fail = 0; tx_ring->stall_watchdog = 0; + if (tx_ring->reschedule) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); return (0); } @@ -1245,6 +1300,12 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring) */ atomic_add_32(&tx_ring->tbd_free, desc_num); + if ((tx_ring->tbd_free >= tx_ring->resched_thresh) && + (tx_ring->reschedule)) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); /* diff --git a/usr/src/uts/common/io/mac/README b/usr/src/uts/common/io/mac/README new file mode 100644 index 0000000000..744c9842c3 --- /dev/null +++ b/usr/src/uts/common/io/mac/README @@ -0,0 +1,80 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# + +This README describes the organization of the files and subdirectories +that make up the misc/mac module. + +Changes to the sources should follow the layout and naming conventions +adopted herein. + +Each functional component of the mac module is implemented in a separate +source file. The external interfaces are declared in header files delivered +under <sys>. The internal data structures and definitions are declared +in header files internal to this directory. + +. Client Interface + This is the kernel programming interface for accessing L2 services as + a consumer. + . mac_client.c + . sys/mac_client.h: APIs intended for external MAC consumers + . 
sys/mac_client_priv.h: APIs for GLDv3 components only (dld, + dls, aggr, vnic, etc). + . mac_client_impl.h Internals. + +. Provider Interface + This is the GLDv3 kernel driver interface. Functions and data structures + are used by L2 drivers to provide services to MAC consumers. + . mac_provider.c + . sys/mac_provider.h + +. MAC Type Plugins + The GLDv3 L2 supports multiple types of media control. Each type is + implemented as a plugin delivered in a separate file under the + plugin/ directory. + Add a new file to the plugin/ directory for introducing a new MAC type. + +. Core Component. + - Scheduling Engine: + . mac_datapath_setup.c: Control path for the scheduler. + . mac_soft_ring.c, + mac_soft_ring.h: Fanout Soft Rings. + . mac_sched.c: Data path + . mac_bcast.c Data path and switching for broadcast and + multicast packets. + . mac_stat.c: Statistics + + - Classification Engine + mac_flow.c: Flows and software classification: + + - NICs Resources Management + . mac.c (this file also has other miscelanea) + +. Misc + . mac.c + . mac_util.c + . mac_ndd.c + diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index a7c472bfb2..1ee6d36cd6 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -24,9 +24,246 @@ * Use is subject to license terms. */ - /* * MAC Services Module + * + * The GLDv3 framework locking - The MAC layer + * -------------------------------------------- + * + * The MAC layer is central to the GLD framework and can provide the locking + * framework needed for itself and for the use of MAC clients. MAC end points + * are fairly disjoint and don't share a lot of state. So a coarse grained + * multi-threading scheme is to single thread all create/modify/delete or set + * type of control operations on a per mac end point while allowing data threads + * concurrently. + * + * Control operations (set) that modify a mac end point are always serialized on + * a per mac end point basis, We have at most 1 such thread per mac end point + * at a time. + * + * All other operations that are not serialized are essentially multi-threaded. + * For example a control operation (get) like getting statistics which may not + * care about reading values atomically or data threads sending or receiving + * data. Mostly these type of operations don't modify the control state. Any + * state these operations care about are protected using traditional locks. + * + * The perimeter only serializes serial operations. It does not imply there + * aren't any other concurrent operations. However a serialized operation may + * sometimes need to make sure it is the only thread. In this case it needs + * to use reference counting mechanisms to cv_wait until any current data + * threads are done. + * + * The mac layer itself does not hold any locks across a call to another layer. + * The perimeter is however held across a down call to the driver to make the + * whole control operation atomic with respect to other control operations. + * Also the data path and get type control operations may proceed concurrently. + * These operations synchronize with the single serial operation on a given mac + * end point using regular locks. 
+ * The perimeter ensures that conflicting operations, such as a
+ * mac_multicast_add and a mac_multicast_remove on the same mac end point,
+ * don't interfere with each other. It also ensures that the changes in the
+ * mac layer and the call to the underlying driver to, say, add a multicast
+ * address are done atomically, without interference from a thread trying to
+ * delete the same address.
+ *
+ * For example, consider
+ * mac_multicst_add()
+ * {
+ *	mac_perimeter_enter();		serialize all control operations
+ *
+ *	grab list lock			protect against access by data threads
+ *	add to list
+ *	drop list lock
+ *
+ *	call driver's mi_multicst
+ *
+ *	mac_perimeter_exit();
+ * }
+ *
+ * To lessen the number of serialization locks and simplify the lock
+ * hierarchy, we serialize all the control operations on a per mac end point
+ * basis, using a single serialization lock called the perimeter. We allow
+ * recursive entry into the perimeter to facilitate use of this mechanism by
+ * both the mac client and the MAC layer itself.
+ *
+ * MAC client means an entity that does an operation on a mac handle
+ * obtained from a mac_open/mac_client_open. Similarly MAC driver means
+ * an entity that does an operation on a mac handle obtained from a
+ * mac_register. An entity could be both client and driver but on different
+ * handles (e.g. aggr) and should only make the corresponding mac interface
+ * calls, i.e. mac driver interface or mac client interface, as appropriate
+ * for that mac handle.
+ *
+ * General rules.
+ * -------------
+ *
+ * R1. The lock order of upcall threads is naturally opposite to downcall
+ * threads. Hence upcalls must not hold any locks across layers for fear of
+ * recursive lock enter and lock order violation. This applies to all layers.
+ *
+ * R2. The perimeter is just another lock. Since it is held in the down
+ * direction, acquiring the perimeter in an upcall is prohibited as it would
+ * cause a deadlock. This applies to all layers.
+ *
+ * Note that upcalls that need to grab the mac perimeter (for example
+ * mac_notify upcalls) can still achieve that by posting the request to a
+ * thread, which can then grab all the required perimeters and locks in the
+ * right global order. Note that in the above example the mac layer itself
+ * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
+ * to the client must do that. Please see the aggr code for an example.
+ *
+ * MAC client rules
+ * ----------------
+ *
+ * R3. A MAC client may use the MAC provided perimeter facility to serialize
+ * control operations on a per mac end point basis. It does this by acquiring
+ * and holding the perimeter across a sequence of calls to the mac layer.
+ * This ensures atomicity across the entire block of mac calls. In this
+ * model the MAC client must not hold any client locks across the calls to
+ * the mac layer. This model is the preferred solution.
+ *
+ * R4. However if a MAC client has a lot of global state across all mac end
+ * points the per mac end point serialization may not be sufficient. In this
+ * case the client may choose to use global locks or use its own
+ * serialization. To avoid deadlocks, these client layer locks held across
+ * the mac calls in the control path must never be acquired by the data path
+ * for the reason mentioned below.
+ *
+ * (Assume that a control operation that holds a client lock blocks in the
+ * mac layer waiting for upcall reference counts to drop to zero. If an
+ * upcall data thread that holds this reference count subsequently tries to
+ * acquire the same client lock, it will deadlock.)
+ *
+ * A MAC client may follow either the R3 model or the R4 model, but can't
+ * mix both. In the former, the hierarchy is Perim -> client locks, but in
+ * the latter it is client locks -> Perim.
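+ *
+ * As a sketch of the R3 model (illustrative only; arguments and error
+ * handling are elided), a client could bracket a sequence of control
+ * operations with the perimeter interfaces provided below:
+ *
+ *	mac_perim_handle_t mph;
+ *
+ *	mac_perim_enter_by_mh(mh, &mph);   serialize with other control ops
+ *	(void) mac_unicast_remove(...);	   no client locks held across
+ *	(void) mac_unicast_add(...);	   these mac calls
+ *	mac_perim_exit(mph);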
+ *
+ * R5. MAC clients must make MAC calls (excluding data calls) in a
+ * cv_wait'able context since they may block while trying to acquire the
+ * perimeter. In addition some calls may block waiting for upcall refcnts
+ * to come down to zero.
+ *
+ * R6. MAC clients must make sure that they are single threaded and all
+ * threads from the top (in particular data threads) have finished before
+ * calling mac_client_close. The MAC framework does not track the number of
+ * client threads using the mac client handle. Also mac clients must make
+ * sure they have undone all the control operations before calling
+ * mac_client_close. For example mac_unicast_remove/mac_multicast_remove to
+ * undo the corresponding mac_unicast_add/mac_multicast_add.
+ *
+ * MAC framework rules
+ * -------------------
+ *
+ * R7. The mac layer itself must not hold any mac layer locks (except the
+ * mac perimeter) across a call to any other layer from the mac layer. The
+ * call to any other layer could be via mi_* entry points, classifier entry
+ * points into the driver or via upcall pointers into layers above. The mac
+ * perimeter may be acquired or held only in the down direction, e.g. when
+ * calling into an mi_* driver entry point to provide atomicity of the
+ * operation.
+ *
+ * R8. Since it is not guaranteed (see R14) that drivers won't hold locks
+ * across mac driver interfaces, the MAC layer must provide a cut out for
+ * control interfaces like upcall notifications and start them in a separate
+ * thread.
+ *
+ * R9. Note that locking order also implies a plumbing order. For example
+ * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
+ * to plumb in any other order must be failed at mac_open time, otherwise it
+ * could lead to deadlocks due to inverse locking order.
+ *
+ * R10. MAC driver interfaces must not block since the driver could call
+ * them in interrupt context.
+ *
+ * R11. Walkers must preferably not hold any locks while calling walker
+ * callbacks. Instead these can operate on reference counts. In simple
+ * callbacks it may be ok to hold a lock and call the callbacks, but this is
+ * harder to maintain in the general case of arbitrary callbacks.
+ *
+ * R12. The MAC layer must protect upcall notification callbacks using
+ * reference counts rather than holding locks across the callbacks.
+ *
+ * R13. Given the variety of drivers, it is preferable if the MAC layer can
+ * make sure that any pointers (such as mac ring pointers) it passes to the
+ * driver remain valid until mac unregister time. Currently the mac layer
+ * achieves this by using generation numbers for rings and freeing the mac
+ * rings only at unregister time. The MAC layer must provide a layer of
+ * indirection and must not expose underlying driver rings or driver data
+ * structures/pointers directly to MAC clients.
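+ *
+ * As an illustration of the cut out required by R8, the notification path
+ * merely records the event and wakes a dedicated thread (a simplified
+ * sketch of what i_mac_notify() below does):
+ *
+ *	i_mac_notify(mip, type)
+ *	{
+ *		mutex_enter(notify lock);
+ *		mi_notify_bits |= (1 << type);	record the event
+ *		cv_broadcast(notify cv);	wake the notify thread
+ *		mutex_exit(notify lock);
+ *	}
+ *
+ * The notify thread then walks the callback list and makes the upcalls with
+ * no mac locks held, so a blocking callback cannot stall or deadlock a
+ * driver downcall.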
+ *
+ * MAC driver rules
+ * ----------------
+ *
+ * R14. It would be preferable if MAC drivers don't hold any locks across
+ * any mac call. However at a minimum they must not hold any locks across
+ * data upcalls. They must also make sure that all references to mac data
+ * structures are cleaned up and that the driver is single threaded at
+ * mac_unregister time.
+ *
+ * R15. MAC driver interfaces don't block and so the action may be done
+ * asynchronously in a separate thread as for example handling notifications.
+ * The driver must not assume that the action is complete when the call
+ * returns.
+ *
+ * R16. Drivers must maintain a generation number per Rx ring, and pass it
+ * back to mac_rx_ring(); They are expected to increment the generation
+ * number whenever the ring's stop routine is invoked.
+ * See comments in mac_rx_ring();
+ *
+ * R17. Similarly mi_stop is another synchronization point and the driver
+ * must ensure that all upcalls are done and there won't be any future
+ * upcall before returning from mi_stop.
+ *
+ * R18. The driver may assume that all set/modify control operations via
+ * the mi_* entry points are single threaded on a per mac end point basis.
+ *
+ * Lock and Perimeter hierarchy scenarios
+ * ---------------------------------------
+ *
+ * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
+ *
+ * ft_lock -> fe_lock [mac_flow_lookup]
+ *
+ * mi_rw_lock -> fe_lock [mac_bcast_send]
+ *
+ * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
+ *
+ * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
+ *
+ * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
+ *
+ * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
+ * client to driver. In the case of clients that explicitly use the mac
+ * provided perimeter mechanism for their serialization, the hierarchy is
+ * Perimeter -> mac layer locks, since the client never holds any locks
+ * across the mac calls. In the case of clients that use their own locks the
+ * hierarchy is Client locks -> Mac Perim -> Mac layer locks. The client
+ * never explicitly calls mac_perim_enter/exit in this case.
+ *
+ * Subflow creation rules
+ * ---------------------------
+ * o If a user-specified cpulist is present on both the underlying link and
+ *   its flows, each flow's cpulist must be a subset of the underlying
+ *   link's.
+ * o If a user-specified fanout mode is present on both link and flow, the
+ *   subflow fanout count has to be less than or equal to that of the
+ *   underlying link. The cpu-bindings for the subflows will be a subset of
+ *   the underlying link's.
+ * o If no cpulist is specified on either the underlying link or the flow,
+ *   the underlying link relies on a MAC tunable to provide out of the box
+ *   fanout. The subflow will have no cpulist (the subflow will be unbound).
+ * o If no cpulist is specified on the underlying link, a subflow can carry
+ *   either a user-specified cpulist or a fanout count. The cpu-bindings for
+ *   the subflow need not then be a subset of the underlying link's.
+ * o If the underlying link carries either a user-specified cpulist or
+ *   fanout mode and the subflow specifies neither, the subflow will be
+ *   created unbound.
+ * o While creating unbound subflows, bandwidth mode changes attempt to
+ *   figure out a right fanout count. In such cases the fanout count will
+ *   override the unbound cpu-binding behavior.
+ * o In addition to this, while cycling between flow and link properties, we
+ *   impose a restriction that if a link property has a subflow with
+ *   user-specified attributes, we will not allow changing the link property.
+ * The administrator needs to reset all the user specified properties for the + * subflows before attempting a link property change. + * Some of the above rules can be overridden by specifying additional command + * line options while creating or modifying link or subflow properties. */ #include <sys/types.h> @@ -39,11 +276,13 @@ #include <sys/strsun.h> #include <sys/strsubr.h> #include <sys/dlpi.h> -#include <sys/dls.h> #include <sys/modhash.h> -#include <sys/vlan.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_soft_ring.h> #include <sys/mac_impl.h> +#include <sys/mac.h> +#include <sys/dls.h> #include <sys/dld.h> #include <sys/modctl.h> #include <sys/fs/dv_node.h> @@ -52,20 +291,45 @@ #include <sys/callb.h> #include <sys/cpuvar.h> #include <sys/atomic.h> +#include <sys/bitmap.h> +#include <sys/sdt.h> +#include <sys/mac_flow.h> +#include <sys/ddi_intr_impl.h> +#include <sys/disp.h> #include <sys/sdt.h> +#include <sys/vnic.h> +#include <sys/vnic_impl.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/exacct.h> +#include <sys/exacct_impl.h> #include <inet/nd.h> #include <sys/ethernet.h> #define IMPL_HASHSZ 67 /* prime */ -static kmem_cache_t *i_mac_impl_cachep; -static mod_hash_t *i_mac_impl_hash; +kmem_cache_t *i_mac_impl_cachep; +mod_hash_t *i_mac_impl_hash; krwlock_t i_mac_impl_lock; uint_t i_mac_impl_count; -static kmem_cache_t *mac_vnic_tx_cache; +static kmem_cache_t *mac_ring_cache; static id_space_t *minor_ids; static uint32_t minor_count; +/* + * Logging stuff. Perhaps mac_logging_interval could be broken into + * mac_flow_log_interval and mac_link_log_interval if we want to be + * able to schedule them differently. + */ +uint_t mac_logging_interval; +boolean_t mac_flow_log_enable; +boolean_t mac_link_log_enable; +timeout_id_t mac_logging_timer; + +/* for debugging, see MAC_DBG_PRT() in mac_impl.h */ +int mac_dbg = 0; + #define MACTYPE_KMODDIR "mac" #define MACTYPE_HASHSZ 67 static mod_hash_t *i_mactype_hash; @@ -75,295 +339,75 @@ static mod_hash_t *i_mactype_hash; */ static kmutex_t i_mactype_lock; -static void i_mac_notify_thread(void *); -static mblk_t *mac_vnic_tx(void *, mblk_t *); -static mblk_t *mac_vnic_txloop(void *, mblk_t *); -static void mac_register_priv_prop(mac_impl_t *, mac_priv_prop_t *, uint_t); -static void mac_unregister_priv_prop(mac_impl_t *); - /* - * Private functions. + * mac_tx_percpu_cnt + * + * Number of per cpu locks per mac_client_impl_t. Used by the transmit side + * in mac_tx to reduce lock contention. This is sized at boot time in mac_init. + * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2. + * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1. 
*/ - -/*ARGSUSED*/ -static int -i_mac_constructor(void *buf, void *arg, int kmflag) -{ - mac_impl_t *mip = buf; - - bzero(buf, sizeof (mac_impl_t)); - - mip->mi_linkstate = LINK_STATE_UNKNOWN; - - rw_init(&mip->mi_state_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_gen_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_data_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_notify_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_rx_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_tx_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_resource_lock, NULL, RW_DRIVER, NULL); - mutex_init(&mip->mi_activelink_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&mip->mi_notify_bits_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&mip->mi_notify_cv, NULL, CV_DRIVER, NULL); - mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&mip->mi_rx_cv, NULL, CV_DRIVER, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -i_mac_destructor(void *buf, void *arg) -{ - mac_impl_t *mip = buf; - - ASSERT(mip->mi_ref == 0); - ASSERT(!mip->mi_exclusive); - ASSERT(mip->mi_active == 0); - ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN); - ASSERT(mip->mi_devpromisc == 0); - ASSERT(mip->mi_promisc == 0); - ASSERT(mip->mi_mmap == NULL); - ASSERT(mip->mi_mmrp == NULL); - ASSERT(mip->mi_mnfp == NULL); - ASSERT(mip->mi_resource_add == NULL); - ASSERT(mip->mi_ksp == NULL); - ASSERT(mip->mi_kstat_count == 0); - ASSERT(mip->mi_notify_bits == 0); - ASSERT(mip->mi_notify_thread == NULL); - - rw_destroy(&mip->mi_gen_lock); - rw_destroy(&mip->mi_state_lock); - rw_destroy(&mip->mi_data_lock); - rw_destroy(&mip->mi_notify_lock); - rw_destroy(&mip->mi_rx_lock); - rw_destroy(&mip->mi_tx_lock); - rw_destroy(&mip->mi_resource_lock); - mutex_destroy(&mip->mi_activelink_lock); - mutex_destroy(&mip->mi_notify_bits_lock); - cv_destroy(&mip->mi_notify_cv); - mutex_destroy(&mip->mi_lock); - cv_destroy(&mip->mi_rx_cv); -} +int mac_tx_percpu_cnt; +int mac_tx_percpu_cnt_max = 128; + +static int i_mac_constructor(void *, void *, int); +static void i_mac_destructor(void *, void *); +static int i_mac_ring_ctor(void *, void *, int); +static void i_mac_ring_dtor(void *, void *); +static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *); +void mac_tx_client_flush(mac_client_impl_t *); +void mac_tx_client_block(mac_client_impl_t *); +static void mac_rx_ring_quiesce(mac_ring_t *, uint_t); +static int mac_start_group_and_rings(mac_group_t *); +static void mac_stop_group_and_rings(mac_group_t *); /* - * mac_vnic_tx_t kmem cache support functions. + * Module initialization functions. */ -/* ARGSUSED */ -static int -i_mac_vnic_tx_ctor(void *buf, void *arg, int mkflag) -{ - mac_vnic_tx_t *vnic_tx = buf; - - bzero(buf, sizeof (mac_vnic_tx_t)); - mutex_init(&vnic_tx->mv_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&vnic_tx->mv_cv, NULL, CV_DRIVER, NULL); - return (0); -} - -/* ARGSUSED */ -static void -i_mac_vnic_tx_dtor(void *buf, void *arg) -{ - mac_vnic_tx_t *vnic_tx = buf; - - ASSERT(vnic_tx->mv_refs == 0); - mutex_destroy(&vnic_tx->mv_lock); - cv_destroy(&vnic_tx->mv_cv); -} - -static void -i_mac_notify(mac_impl_t *mip, mac_notify_type_t type) +void +mac_init(void) { - rw_enter(&i_mac_impl_lock, RW_READER); - if (mip->mi_disabled) - goto exit; - - /* - * Guard against incorrect notifications. (Running a newer - * mac client against an older implementation?) - */ - if (type >= MAC_NNOTE) - goto exit; + mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? 
max_ncpus : + boot_max_ncpus); - mutex_enter(&mip->mi_notify_bits_lock); - mip->mi_notify_bits |= (1 << type); - cv_broadcast(&mip->mi_notify_cv); - mutex_exit(&mip->mi_notify_bits_lock); + /* Upper bound is mac_tx_percpu_cnt_max */ + if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max) + mac_tx_percpu_cnt = mac_tx_percpu_cnt_max; -exit: - rw_exit(&i_mac_impl_lock); -} + if (mac_tx_percpu_cnt < 1) { + /* Someone set max_tx_percpu_cnt_max to 0 or less */ + mac_tx_percpu_cnt = 1; + } -static void -i_mac_log_link_state(mac_impl_t *mip) -{ + ASSERT(mac_tx_percpu_cnt >= 1); + mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1)); /* - * If no change, then it is not interesting. + * Make it of the form 2**N - 1 in the range + * [0 .. mac_tx_percpu_cnt_max - 1] */ - if (mip->mi_lastlinkstate == mip->mi_linkstate) - return; - - switch (mip->mi_linkstate) { - case LINK_STATE_UP: - if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) { - char det[200]; - - mip->mi_type->mt_ops.mtops_link_details(det, - sizeof (det), (mac_handle_t)mip, mip->mi_pdata); - - cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det); - } else { - cmn_err(CE_NOTE, "!%s link up", mip->mi_name); - } - break; - - case LINK_STATE_DOWN: - /* - * Only transitions from UP to DOWN are interesting - */ - if (mip->mi_lastlinkstate != LINK_STATE_UNKNOWN) - cmn_err(CE_NOTE, "!%s link down", mip->mi_name); - break; - - case LINK_STATE_UNKNOWN: - /* - * This case is normally not interesting. - */ - break; - } - mip->mi_lastlinkstate = mip->mi_linkstate; -} - -static void -i_mac_notify_thread(void *arg) -{ - mac_impl_t *mip = arg; - callb_cpr_t cprinfo; - - CALLB_CPR_INIT(&cprinfo, &mip->mi_notify_bits_lock, callb_generic_cpr, - "i_mac_notify_thread"); - - mutex_enter(&mip->mi_notify_bits_lock); - for (;;) { - uint32_t bits; - uint32_t type; - - bits = mip->mi_notify_bits; - if (bits == 0) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock); - CALLB_CPR_SAFE_END(&cprinfo, &mip->mi_notify_bits_lock); - continue; - } - mip->mi_notify_bits = 0; - - if ((bits & (1 << MAC_NNOTE)) != 0) { - /* request to quit */ - ASSERT(mip->mi_disabled); - break; - } - - mutex_exit(&mip->mi_notify_bits_lock); - - /* - * Log link changes. - */ - if ((bits & (1 << MAC_NOTE_LINK)) != 0) - i_mac_log_link_state(mip); - - /* - * Do notification callbacks for each notification type. - */ - for (type = 0; type < MAC_NNOTE; type++) { - mac_notify_fn_t *mnfp; - - if ((bits & (1 << type)) == 0) { - continue; - } - - /* - * Walk the list of notifications. - */ - rw_enter(&mip->mi_notify_lock, RW_READER); - for (mnfp = mip->mi_mnfp; mnfp != NULL; - mnfp = mnfp->mnf_nextp) { - - mnfp->mnf_fn(mnfp->mnf_arg, type); - } - rw_exit(&mip->mi_notify_lock); - } - - mutex_enter(&mip->mi_notify_bits_lock); - } - - mip->mi_notify_thread = NULL; - cv_broadcast(&mip->mi_notify_cv); - - CALLB_CPR_EXIT(&cprinfo); - - thread_exit(); -} - -static mactype_t * -i_mactype_getplugin(const char *pname) -{ - mactype_t *mtype = NULL; - boolean_t tried_modload = B_FALSE; - - mutex_enter(&i_mactype_lock); + mac_tx_percpu_cnt--; -find_registered_mactype: - if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname, - (mod_hash_val_t *)&mtype) != 0) { - if (!tried_modload) { - /* - * If the plugin has not yet been loaded, then - * attempt to load it now. If modload() succeeds, - * the plugin should have registered using - * mactype_register(), in which case we can go back - * and attempt to find it again. 
- */ - if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) { - tried_modload = B_TRUE; - goto find_registered_mactype; - } - } - } else { - /* - * Note that there's no danger that the plugin we've loaded - * could be unloaded between the modload() step and the - * reference count bump here, as we're holding - * i_mactype_lock, which mactype_unregister() also holds. - */ - atomic_inc_32(&mtype->mt_ref); - } - - mutex_exit(&i_mactype_lock); - return (mtype); -} - -/* - * Module initialization functions. - */ - -void -mac_init(void) -{ i_mac_impl_cachep = kmem_cache_create("mac_impl_cache", sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor, NULL, NULL, NULL, 0); ASSERT(i_mac_impl_cachep != NULL); - mac_vnic_tx_cache = kmem_cache_create("mac_vnic_tx_cache", - sizeof (mac_vnic_tx_t), 0, i_mac_vnic_tx_ctor, i_mac_vnic_tx_dtor, - NULL, NULL, NULL, 0); - ASSERT(mac_vnic_tx_cache != NULL); + mac_ring_cache = kmem_cache_create("mac_ring_cache", + sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL, + NULL, NULL, 0); + ASSERT(mac_ring_cache != NULL); i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash", IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL); + + mac_flow_init(); + mac_soft_ring_init(); + mac_bcast_init(); + mac_client_init(); + i_mac_impl_count = 0; i_mactype_hash = mod_hash_create_extended("mactype_hash", @@ -380,6 +424,12 @@ mac_init(void) minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1, MAXMIN32); ASSERT(minor_ids != NULL); minor_count = 0; + + /* Let's default to 20 seconds */ + mac_logging_interval = 20; + mac_flow_log_enable = B_FALSE; + mac_link_log_enable = B_FALSE; + mac_logging_timer = 0; } int @@ -389,567 +439,701 @@ mac_fini(void) return (EBUSY); id_space_destroy(minor_ids); + mac_flow_fini(); mod_hash_destroy_hash(i_mac_impl_hash); rw_destroy(&i_mac_impl_lock); - kmem_cache_destroy(i_mac_impl_cachep); - kmem_cache_destroy(mac_vnic_tx_cache); + mac_client_fini(); + kmem_cache_destroy(mac_ring_cache); mod_hash_destroy_hash(i_mactype_hash); + mac_soft_ring_finish(); return (0); } -/* - * Client functions. - */ - -static int -mac_hold(const char *macname, mac_impl_t **pmip) +void +mac_init_ops(struct dev_ops *ops, const char *name) { - mac_impl_t *mip; - int err; - - /* - * Check the device name length to make sure it won't overflow our - * buffer. - */ - if (strlen(macname) >= MAXNAMELEN) - return (EINVAL); - - /* - * Look up its entry in the global hash table. 
- */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname, - (mod_hash_val_t *)&mip); + dld_init_ops(ops, name); +} - if (err != 0) { - rw_exit(&i_mac_impl_lock); - return (ENOENT); - } +void +mac_fini_ops(struct dev_ops *ops) +{ + dld_fini_ops(ops); +} - if (mip->mi_disabled) { - rw_exit(&i_mac_impl_lock); - return (ENOENT); - } +/*ARGSUSED*/ +static int +i_mac_constructor(void *buf, void *arg, int kmflag) +{ + mac_impl_t *mip = buf; - if (mip->mi_exclusive) { - rw_exit(&i_mac_impl_lock); - return (EBUSY); - } + bzero(buf, sizeof (mac_impl_t)); - mip->mi_ref++; - rw_exit(&i_mac_impl_lock); + mip->mi_linkstate = LINK_STATE_UNKNOWN; + mip->mi_nclients = 0; - *pmip = mip; + mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL); + mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL); + + mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock; + cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL); + mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock; + cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL); return (0); } +/*ARGSUSED*/ static void -mac_rele(mac_impl_t *mip) +i_mac_destructor(void *buf, void *arg) { - rw_enter(&i_mac_impl_lock, RW_WRITER); - ASSERT(mip->mi_ref != 0); - if (--mip->mi_ref == 0) - ASSERT(!mip->mi_activelink); - rw_exit(&i_mac_impl_lock); -} + mac_impl_t *mip = buf; + mac_cb_info_t *mcbi; -int -mac_hold_exclusive(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; + ASSERT(mip->mi_ref == 0); + ASSERT(mip->mi_active == 0); + ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN); + ASSERT(mip->mi_devpromisc == 0); + ASSERT(mip->mi_promisc == 0); + ASSERT(mip->mi_ksp == NULL); + ASSERT(mip->mi_kstat_count == 0); + ASSERT(mip->mi_nclients == 0); + ASSERT(mip->mi_nactiveclients == 0); + ASSERT(mip->mi_state_flags == 0); + ASSERT(mip->mi_factory_addr == NULL); + ASSERT(mip->mi_factory_addr_num == 0); + ASSERT(mip->mi_default_tx_ring == NULL); + + mcbi = &mip->mi_notify_cb_info; + ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0); + ASSERT(mip->mi_notify_bits == 0); + ASSERT(mip->mi_notify_thread == NULL); + ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock); + mcbi->mcbi_lockp = NULL; - /* - * Look up its entry in the global hash table. 
- */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - if (mip->mi_disabled) { - rw_exit(&i_mac_impl_lock); - return (ENOENT); - } + mcbi = &mip->mi_promisc_cb_info; + ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL); + ASSERT(mip->mi_promisc_list == NULL); + ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock); + mcbi->mcbi_lockp = NULL; - if (mip->mi_ref != 0) { - rw_exit(&i_mac_impl_lock); - return (EBUSY); - } + ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL); + ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0); - ASSERT(!mip->mi_exclusive); + mutex_destroy(&mip->mi_lock); + rw_destroy(&mip->mi_rw_lock); - mip->mi_ref++; - mip->mi_exclusive = B_TRUE; - rw_exit(&i_mac_impl_lock); + mutex_destroy(&mip->mi_promisc_lock); + cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv); + mutex_destroy(&mip->mi_notify_lock); + cv_destroy(&mip->mi_notify_cb_info.mcbi_cv); + mutex_destroy(&mip->mi_ring_lock); +} + +/* ARGSUSED */ +static int +i_mac_ring_ctor(void *buf, void *arg, int kmflag) +{ + mac_ring_t *ring = (mac_ring_t *)buf; + + bzero(ring, sizeof (mac_ring_t)); + cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL); + ring->mr_state = MR_FREE; return (0); } +/* ARGSUSED */ +static void +i_mac_ring_dtor(void *buf, void *arg) +{ + mac_ring_t *ring = (mac_ring_t *)buf; + + cv_destroy(&ring->mr_cv); + mutex_destroy(&ring->mr_lock); +} + +/* + * Common functions to do mac callback addition and deletion. Currently this is + * used by promisc callbacks and notify callbacks. List addition and deletion + * need to take care of list walkers. List walkers in general, can't hold list + * locks and make upcall callbacks due to potential lock order and recursive + * reentry issues. Instead list walkers increment the list walker count to mark + * the presence of a walker thread. Addition can be carefully done to ensure + * that the list walker always sees either the old list or the new list. + * However the deletion can't be done while the walker is active, instead the + * deleting thread simply marks the entry as logically deleted. The last walker + * physically deletes and frees up the logically deleted entries when the walk + * is complete. + */ void -mac_rele_exclusive(mac_handle_t mh) +mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head, + mac_cb_t *mcb_elem) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_cb_t *p; + mac_cb_t **pp; + + /* Verify it is not already in the list */ + for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) { + if (p == mcb_elem) + break; + } + VERIFY(p == NULL); /* - * Look up its entry in the global hash table. + * Add it to the head of the callback list. The membar ensures that + * the following list pointer manipulations reach global visibility + * in exactly the program order below. */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - ASSERT(mip->mi_ref == 1 && mip->mi_exclusive); - mip->mi_ref--; - mip->mi_exclusive = B_FALSE; - rw_exit(&i_mac_impl_lock); + ASSERT(MUTEX_HELD(mcbi->mcbi_lockp)); + + mcb_elem->mcb_nextp = *mcb_head; + membar_producer(); + *mcb_head = mcb_elem; } -int -mac_open(const char *macname, mac_handle_t *mhp) +/* + * Mark the entry as logically deleted. If there aren't any walkers unlink + * from the list. In either case return the corresponding status. 
+ */
+boolean_t
+mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
+    mac_cb_t *mcb_elem)
+{
+	mac_cb_t	*p;
+	mac_cb_t	**pp;
+
+	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+	/*
+	 * Search the callback list for the entry to be removed
+	 */
+	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
+		if (p == mcb_elem)
+			break;
+	}
+	VERIFY(p != NULL);
+
+	/*
+	 * If there are walkers just mark it as deleted and the last walker
+	 * will remove it from the list and free it.
+	 */
+	if (mcbi->mcbi_walker_cnt != 0) {
+		p->mcb_flags |= MCB_CONDEMNED;
+		mcbi->mcbi_del_cnt++;
+		return (B_FALSE);
+	}
+
+	ASSERT(mcbi->mcbi_del_cnt == 0);
+	*pp = p->mcb_nextp;
+	p->mcb_nextp = NULL;
+	return (B_TRUE);
+}
+
+/*
+ * Wait for all pending callback removals to be completed
+ */
+void
+mac_callback_remove_wait(mac_cb_info_t *mcbi)
+{
+	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+	while (mcbi->mcbi_del_cnt != 0) {
+		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
+		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
+	}
+}
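The cooperation between removers and walkers may be easier to see end to end. The following is an illustrative sketch (the function name example_walk is hypothetical) of how a walker is expected to use the pieces defined here: the walker count pins entries in place during the walk, and the last walker out performs the physical cleanup of condemned entries.

    /*
     * Sketch of a callback-list walker. Addition is ordered with
     * membar_producer(), and entries are never unlinked while a walker
     * is present, so the traversal itself needs no list lock.
     */
    static void
    example_walk(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
    {
        mac_cb_t *mcb;

        mutex_enter(mcbi->mcbi_lockp);
        mcbi->mcbi_walker_cnt++;            /* pin the list entries */
        mutex_exit(mcbi->mcbi_lockp);

        for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
            if (!(mcb->mcb_flags & MCB_CONDEMNED)) {
                /* make the upcall here, with no list lock held */
            }
        }

        mutex_enter(mcbi->mcbi_lockp);
        if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) {
            /* last walker: unlink and free the condemned entries */
            mac_callback_free(mac_callback_walker_cleanup(mcbi, mcb_head));
            cv_broadcast(&mcbi->mcbi_cv);   /* wake remove_wait()ers */
        }
        mutex_exit(mcbi->mcbi_lockp);
    }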
+/*
+ * The last mac callback walker does the cleanup. Walk the list and unlink
+ * all the logically deleted entries and construct a temporary list of
+ * removed entries. Return the list of removed entries to the caller.
+ */
+mac_cb_t *
+mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
+{
+	mac_cb_t	*p;
+	mac_cb_t	**pp;
+	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
+	int	cnt = 0;
+
+	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
+
+	pp = mcb_head;
+	while (*pp != NULL) {
+		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
+			p = *pp;
+			*pp = p->mcb_nextp;
+			p->mcb_nextp = rmlist;
+			rmlist = p;
+			cnt++;
+			continue;
+		}
+		pp = &(*pp)->mcb_nextp;
+	}
+
+	ASSERT(mcbi->mcbi_del_cnt == cnt);
+	mcbi->mcbi_del_cnt = 0;
+	return (rmlist);
+}
+
+boolean_t
+mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
+{
+	mac_cb_t	*mcb;
+
+	/* Check whether the element is in the list */
+	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
+		if (mcb == mcb_elem)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+boolean_t
+mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
+{
+	boolean_t	found;
+
+	mutex_enter(mcbi->mcbi_lockp);
+	found = mac_callback_lookup(mcb_headp, mcb_elem);
+	mutex_exit(mcbi->mcbi_lockp);
+
+	return (found);
+}
+
+/* Free the list of removed callbacks */
+void
+mac_callback_free(mac_cb_t *rmlist)
+{
+	mac_cb_t	*mcb;
+	mac_cb_t	*mcb_next;
+
+	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
+		mcb_next = mcb->mcb_nextp;
+		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
+	}
+}
+
+/*
+ * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
+ * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
+ * is only a single shared total walker count, and an entry can't be physically
+ * unlinked if a walker is active on either list. The last walker does this
+ * cleanup of logically deleted entries.
+ */
+void
+i_mac_promisc_walker_cleanup(mac_impl_t *mip)
+{
+	mac_cb_t	*rmlist;
+	mac_cb_t	*mcb;
+	mac_cb_t	*mcb_next;
+	mac_promisc_impl_t	*mpip;
+
+	/*
+	 * Construct a temporary list of deleted callbacks by walking the
+	 * mi_promisc_list. Then for each entry in the temporary list,
+	 * remove it from the mci_promisc_list and free the entry.
+ */ + rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info, + &mip->mi_promisc_list); + + for (mcb = rmlist; mcb != NULL; mcb = mcb_next) { + mcb_next = mcb->mcb_nextp; + mpip = (mac_promisc_impl_t *)mcb->mcb_objp; + VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info, + &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link)); + mcb->mcb_flags = 0; + mcb->mcb_nextp = NULL; + kmem_cache_free(mac_promisc_impl_cache, mpip); + } } -const char * -mac_name(mac_handle_t mh) +void +i_mac_notify(mac_impl_t *mip, mac_notify_type_t type) { - return (((mac_impl_t *)mh)->mi_name); -} + mac_cb_info_t *mcbi; -minor_t -mac_minor(mac_handle_t mh) -{ - return (((mac_impl_t *)mh)->mi_minor); + /* + * Signal the notify thread even after mi_ref has become zero and + * mi_disabled is set. The synchronization with the notify thread + * happens in mac_unregister and that implies the driver must make + * sure it is single-threaded (with respect to mac calls) and that + * all pending mac calls have returned before it calls mac_unregister + */ + rw_enter(&i_mac_impl_lock, RW_READER); + if (mip->mi_state_flags & MIS_DISABLED) + goto exit; + + /* + * Guard against incorrect notifications. (Running a newer + * mac client against an older implementation?) + */ + if (type >= MAC_NNOTE) + goto exit; + + mcbi = &mip->mi_notify_cb_info; + mutex_enter(mcbi->mcbi_lockp); + mip->mi_notify_bits |= (1 << type); + cv_broadcast(&mcbi->mcbi_cv); + mutex_exit(mcbi->mcbi_lockp); + +exit: + rw_exit(&i_mac_impl_lock); } -uint64_t -mac_stat_get(mac_handle_t mh, uint_t stat) +/* + * Mac serialization primitives. Please see the block comment at the + * top of the file. + */ +void +i_mac_perim_enter(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - uint64_t val; - int ret; - - /* - * The range of stat determines where it is maintained. Stat - * values from 0 up to (but not including) MAC_STAT_MIN are - * mainteined by the mac module itself. Everything else is - * maintained by the driver. - */ - if (stat < MAC_STAT_MIN) { - /* These stats are maintained by the mac module itself. */ - switch (stat) { - case MAC_STAT_LINK_STATE: - return (mip->mi_linkstate); - case MAC_STAT_LINK_UP: - return (mip->mi_linkstate == LINK_STATE_UP); - case MAC_STAT_PROMISC: - return (mip->mi_devpromisc != 0); - default: - ASSERT(B_FALSE); - } - } + mac_client_impl_t *mcip; - /* - * Call the driver to get the given statistic. - */ - ret = mip->mi_getstat(mip->mi_driver, stat, &val); - if (ret != 0) { + if (mip->mi_state_flags & MIS_IS_VNIC) { /* - * The driver doesn't support this statistic. Get the - * statistic's default value. + * This is a VNIC. Return the lower mac since that is what + * we want to serialize on. */ - val = mac_stat_default(mip, stat); + mcip = mac_vnic_lower(mip); + mip = mcip->mci_mip; + } + + mutex_enter(&mip->mi_perim_lock); + if (mip->mi_perim_owner == curthread) { + mip->mi_perim_ocnt++; + mutex_exit(&mip->mi_perim_lock); + return; } - return (val); + + while (mip->mi_perim_owner != NULL) + cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock); + + mip->mi_perim_owner = curthread; + ASSERT(mip->mi_perim_ocnt == 0); + mip->mi_perim_ocnt++; +#ifdef DEBUG + mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack, + MAC_PERIM_STACK_DEPTH); +#endif + mutex_exit(&mip->mi_perim_lock); } int -mac_start(mac_handle_t mh) +i_mac_perim_enter_nowait(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - int err; + /* + * The vnic is a special case, since the serialization is done based + * on the lower mac. 
If the lower mac is busy, it does not imply the + * vnic can't be unregistered. But in the case of other drivers, + * a busy perimeter or open mac handles implies that the mac is busy + * and can't be unregistered. + */ + if (mip->mi_state_flags & MIS_IS_VNIC) { + i_mac_perim_enter(mip); + return (0); + } - ASSERT(mip->mi_start != NULL); + mutex_enter(&mip->mi_perim_lock); + if (mip->mi_perim_owner != NULL) { + mutex_exit(&mip->mi_perim_lock); + return (EBUSY); + } + ASSERT(mip->mi_perim_ocnt == 0); + mip->mi_perim_owner = curthread; + mip->mi_perim_ocnt++; + mutex_exit(&mip->mi_perim_lock); - rw_enter(&(mip->mi_state_lock), RW_WRITER); + return (0); +} - /* - * Check whether the device is already started. - */ - if (mip->mi_active++ != 0) { +void +i_mac_perim_exit(mac_impl_t *mip) +{ + mac_client_impl_t *mcip; + + if (mip->mi_state_flags & MIS_IS_VNIC) { /* - * It's already started so there's nothing more to do. + * This is a VNIC. Return the lower mac since that is what + * we want to serialize on. */ - err = 0; - goto done; + mcip = mac_vnic_lower(mip); + mip = mcip->mci_mip; } - /* - * Start the device. - */ - if ((err = mip->mi_start(mip->mi_driver)) != 0) - --mip->mi_active; + ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0); -done: - rw_exit(&(mip->mi_state_lock)); - return (err); + mutex_enter(&mip->mi_perim_lock); + if (--mip->mi_perim_ocnt == 0) { + mip->mi_perim_owner = NULL; + cv_signal(&mip->mi_perim_cv); + } + mutex_exit(&mip->mi_perim_lock); } -void -mac_stop(mac_handle_t mh) +/* + * Returns whether the current thread holds the mac perimeter. Used in making + * assertions. + */ +boolean_t +mac_perim_held(mac_handle_t mh) { mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; - ASSERT(mip->mi_stop != NULL); - - rw_enter(&(mip->mi_state_lock), RW_WRITER); - - /* - * Check whether the device is still needed. - */ - ASSERT(mip->mi_active != 0); - if (--mip->mi_active != 0) { + if (mip->mi_state_flags & MIS_IS_VNIC) { /* - * It's still needed so there's nothing more to do. + * This is a VNIC. Return the lower mac since that is what + * we want to serialize on. */ - goto done; + mcip = mac_vnic_lower(mip); + mip = mcip->mci_mip; } + return (mip->mi_perim_owner == curthread); +} +/* + * mac client interfaces to enter the mac perimeter of a mac end point, given + * its mac handle, or macname or linkid. + */ +void +mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + i_mac_perim_enter(mip); /* - * Stop the device. + * The mac_perim_handle_t returned encodes the 'mip' and whether a + * mac_open has been done internally while entering the perimeter. + * This information is used in mac_perim_exit */ - mip->mi_stop(mip->mi_driver); - -done: - rw_exit(&(mip->mi_state_lock)); + MAC_ENCODE_MPH(*mphp, mip, 0); } int -mac_multicst_add(mac_handle_t mh, const uint8_t *addr) +mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_multicst_addr_t **pp; - mac_multicst_addr_t *p; - int err; - - ASSERT(mip->mi_multicst != NULL); + int err; + mac_handle_t mh; - /* - * Verify the address. - */ - if ((err = mip->mi_type->mt_ops.mtops_multicst_verify(addr, - mip->mi_pdata)) != 0) { + if ((err = mac_open(name, &mh)) != 0) return (err); - } - /* - * Check whether the given address is already enabled. 
- */ - rw_enter(&(mip->mi_data_lock), RW_WRITER); - for (pp = &(mip->mi_mmap); (p = *pp) != NULL; pp = &(p->mma_nextp)) { - if (bcmp(p->mma_addr, addr, mip->mi_type->mt_addr_length) == - 0) { - /* - * The address is already enabled so just bump the - * reference count. - */ - p->mma_ref++; - err = 0; - goto done; - } - } + mac_perim_enter_by_mh(mh, mphp); + MAC_ENCODE_MPH(*mphp, mh, 1); + return (0); +} - /* - * Allocate a new list entry. - */ - if ((p = kmem_zalloc(sizeof (mac_multicst_addr_t), - KM_NOSLEEP)) == NULL) { - err = ENOMEM; - goto done; - } +int +mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp) +{ + int err; + mac_handle_t mh; - /* - * Enable a new multicast address. - */ - if ((err = mip->mi_multicst(mip->mi_driver, B_TRUE, addr)) != 0) { - kmem_free(p, sizeof (mac_multicst_addr_t)); - goto done; - } + if ((err = mac_open_by_linkid(linkid, &mh)) != 0) + return (err); - /* - * Add the address to the list of enabled addresses. - */ - bcopy(addr, p->mma_addr, mip->mi_type->mt_addr_length); - p->mma_ref++; - *pp = p; + mac_perim_enter_by_mh(mh, mphp); + MAC_ENCODE_MPH(*mphp, mh, 1); + return (0); +} -done: - rw_exit(&(mip->mi_data_lock)); - return (err); +void +mac_perim_exit(mac_perim_handle_t mph) +{ + mac_impl_t *mip; + boolean_t need_close; + + MAC_DECODE_MPH(mph, mip, need_close); + i_mac_perim_exit(mip); + if (need_close) + mac_close((mac_handle_t)mip); } int -mac_multicst_remove(mac_handle_t mh, const uint8_t *addr) +mac_hold(const char *macname, mac_impl_t **pmip) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_multicst_addr_t **pp; - mac_multicst_addr_t *p; - int err; + mac_impl_t *mip; + int err; - ASSERT(mip->mi_multicst != NULL); + /* + * Check the device name length to make sure it won't overflow our + * buffer. + */ + if (strlen(macname) >= MAXNAMELEN) + return (EINVAL); /* - * Find the entry in the list for the given address. + * Look up its entry in the global hash table. */ - rw_enter(&(mip->mi_data_lock), RW_WRITER); - for (pp = &(mip->mi_mmap); (p = *pp) != NULL; pp = &(p->mma_nextp)) { - if (bcmp(p->mma_addr, addr, mip->mi_type->mt_addr_length) == - 0) { - if (--p->mma_ref == 0) - break; + rw_enter(&i_mac_impl_lock, RW_WRITER); + err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname, + (mod_hash_val_t *)&mip); - /* - * There is still a reference to this address so - * there's nothing more to do. - */ - err = 0; - goto done; - } + if (err != 0) { + rw_exit(&i_mac_impl_lock); + return (ENOENT); } - /* - * We did not find an entry for the given address so it is not - * currently enabled. - */ - if (p == NULL) { - err = ENOENT; - goto done; + if (mip->mi_state_flags & MIS_DISABLED) { + rw_exit(&i_mac_impl_lock); + return (ENOENT); } - ASSERT(p->mma_ref == 0); - /* - * Disable the multicast address. - */ - if ((err = mip->mi_multicst(mip->mi_driver, B_FALSE, addr)) != 0) { - p->mma_ref++; - goto done; + if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) { + rw_exit(&i_mac_impl_lock); + return (EBUSY); } - /* - * Remove it from the list. - */ - *pp = p->mma_nextp; - kmem_free(p, sizeof (mac_multicst_addr_t)); + mip->mi_ref++; + rw_exit(&i_mac_impl_lock); -done: - rw_exit(&(mip->mi_data_lock)); - return (err); + *pmip = mip; + return (0); } -/* - * mac_unicst_verify: Verifies the passed address. It fails - * if the passed address is a group address or has incorrect length. 
- */ -boolean_t -mac_unicst_verify(mac_handle_t mh, const uint8_t *addr, uint_t len) +void +mac_rele(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - - /* - * Verify the address. - */ - if ((len != mip->mi_type->mt_addr_length) || - (mip->mi_type->mt_ops.mtops_unicst_verify(addr, - mip->mi_pdata)) != 0) { - return (B_FALSE); - } else { - return (B_TRUE); + rw_enter(&i_mac_impl_lock, RW_WRITER); + ASSERT(mip->mi_ref != 0); + if (--mip->mi_ref == 0) { + ASSERT(mip->mi_nactiveclients == 0 && + !(mip->mi_state_flags & MIS_EXCLUSIVE)); } + rw_exit(&i_mac_impl_lock); } +/* + * This function is called only by mac_client_open. + */ int -mac_unicst_set(mac_handle_t mh, const uint8_t *addr) +mac_start(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - int err; - boolean_t notify = B_FALSE; - - ASSERT(mip->mi_unicst != NULL); + int err = 0; - /* - * Verify the address. - */ - if ((err = mip->mi_type->mt_ops.mtops_unicst_verify(addr, - mip->mi_pdata)) != 0) { - return (err); - } + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + ASSERT(mip->mi_start != NULL); /* - * Program the new unicast address. + * Check whether the device is already started. */ - rw_enter(&(mip->mi_data_lock), RW_WRITER); + if (mip->mi_active++ == 0) { + mac_ring_t *ring = NULL; - /* - * If address doesn't change, do nothing. - * This check is necessary otherwise it may call into mac_unicst_set - * recursively. - */ - if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) - goto done; + /* + * Start the device. + */ + err = mip->mi_start(mip->mi_driver); + if (err != 0) { + mip->mi_active--; + return (err); + } - if ((err = mip->mi_unicst(mip->mi_driver, addr)) != 0) - goto done; + /* + * Start the default tx ring. + */ + if (mip->mi_default_tx_ring != NULL) { - /* - * Save the address and flag that we need to send a notification. - */ - bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length); - notify = B_TRUE; + ring = (mac_ring_t *)mip->mi_default_tx_ring; + err = mac_start_ring(ring); + if (err != 0) { + mip->mi_active--; + return (err); + } + ring->mr_state = MR_INUSE; + } -done: - rw_exit(&(mip->mi_data_lock)); + if (mip->mi_rx_groups != NULL) { + /* + * Start the default ring, since it will be needed + * to receive broadcast and multicast traffic for + * both primary and non-primary MAC clients. + */ + mac_group_t *grp = &mip->mi_rx_groups[0]; - if (notify) - i_mac_notify(mip, MAC_NOTE_UNICST); + ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED); + err = mac_start_group_and_rings(grp); + if (err != 0) { + mip->mi_active--; + if (ring != NULL) { + mac_stop_ring(ring); + ring->mr_state = MR_FREE; + } + return (err); + } + mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED); + } + } return (err); } +/* + * This function is called only by mac_client_close. + */ void -mac_unicst_get(mac_handle_t mh, uint8_t *addr) +mac_stop(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; + ASSERT(mip->mi_stop != NULL); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); /* - * Copy out the current unicast source address. + * Check whether the device is still needed. */ - rw_enter(&(mip->mi_data_lock), RW_READER); - bcopy(mip->mi_addr, addr, mip->mi_type->mt_addr_length); - rw_exit(&(mip->mi_data_lock)); -} + ASSERT(mip->mi_active != 0); + if (--mip->mi_active == 0) { + if (mip->mi_rx_groups != NULL) { + /* + * There should be no more active clients since the + * MAC is being stopped. Stop the default RX group + * and transition it back to registered state. 
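+ *
+ * Roughly, the default group is expected to move
+ * REGISTERED -> SHARED on the first mac_start() and
+ * back to REGISTERED on the last mac_stop(); RESERVED
+ * is entered only while an exclusive client owns the
+ * group (see mac_set_rx_group_state() below).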
+ */
+ mac_group_t *grp = &mip->mi_rx_groups[0];

-void
-mac_dest_get(mac_handle_t mh, uint8_t *addr)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ /*
+ * When clients are torn down, the groups
+ * are released via mac_release_rx_group which
+ * knows that the default group is always in
+ * started mode since broadcast uses it. So
+ * we can assert that there are no clients
+ * (since mac_bcast_add doesn't register itself
+ * as a client) and the group is in SHARED state.
+ */
+ ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
+ ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
+ mip->mi_nactiveclients == 0);
+ mac_stop_group_and_rings(grp);
+ mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
+ }

- /*
- * Copy out the current destination address.
- */
- rw_enter(&(mip->mi_data_lock), RW_READER);
- bcopy(mip->mi_dstaddr, addr, mip->mi_type->mt_addr_length);
- rw_exit(&(mip->mi_data_lock));
+ if (mip->mi_default_tx_ring != NULL) {
+ mac_ring_t *ring;
+
+ ring = (mac_ring_t *)mip->mi_default_tx_ring;
+ mac_stop_ring(ring);
+ ring->mr_state = MR_FREE;
+ }
+
+ /*
+ * Stop the device.
+ */
+ mip->mi_stop(mip->mi_driver);
+ }
}

int
-mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
+i_mac_promisc_set(mac_impl_t *mip, boolean_t on, mac_promisc_type_t ptype)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
 int err = 0;

+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
 ASSERT(mip->mi_setpromisc != NULL);
 ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);
@@ -958,7 +1142,6 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 * For details on the distinction between "device promiscuous mode"
 * and "MAC promiscuous mode", see PSARC/2005/289.
 */
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
 if (on) {
 /*
 * Enable promiscuous mode on the device if not yet enabled.
@@ -967,7 +1150,7 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
 if (err != 0) {
 mip->mi_devpromisc--;
- goto done;
+ return (err);
 }
 i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
 }
@@ -978,10 +1161,9 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 if (ptype == MAC_PROMISC && mip->mi_promisc++ == 0)
 i_mac_notify(mip, MAC_NOTE_PROMISC);
 } else {
- if (mip->mi_devpromisc == 0) {
- err = EPROTO;
- goto done;
- }
+ if (mip->mi_devpromisc == 0)
+ return (EPROTO);
+
 /*
 * Disable promiscuous mode on the device if this is the last
 * enabling.
@@ -990,7 +1172,7 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
 if (err != 0) {
 mip->mi_devpromisc++;
- goto done;
+ return (err);
 }
 i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
 }
@@ -1003,11 +1185,27 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 i_mac_notify(mip, MAC_NOTE_PROMISC);
 }

-done:
- rw_exit(&(mip->mi_data_lock));
- return (err);
+ return (0);
}

+int
+mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ int rv;
+
+ i_mac_perim_enter(mip);
+ rv = i_mac_promisc_set(mip, on, ptype);
+ i_mac_perim_exit(mip);
+
+ return (rv);
+}
+
+/*
+ * The promiscuity state can change any time.
If the caller needs to take + * actions that are atomic with the promiscuity state, then the caller needs + * to bracket the entire sequence with mac_perim_enter/exit + */ boolean_t mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype) { @@ -1024,1296 +1222,1162 @@ mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype) return (mip->mi_promisc != 0); } +/* + * Invoked at MAC instance attach time to initialize the list + * of factory MAC addresses supported by a MAC instance. This function + * builds a local cache in the mac_impl_t for the MAC addresses + * supported by the underlying hardware. The MAC clients themselves + * use the mac_addr_factory*() functions to query and reserve + * factory MAC addresses. + */ void -mac_sdu_get(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu) +mac_addr_factory_init(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_capab_multifactaddr_t capab; + uint8_t *addr; + int i; - if (min_sdu != NULL) - *min_sdu = mip->mi_sdu_min; - if (max_sdu != NULL) - *max_sdu = mip->mi_sdu_max; -} - -void -mac_resources(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; + /* + * First round to see how many factory MAC addresses are available. + */ + bzero(&capab, sizeof (capab)); + if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR, + &capab) || (capab.mcm_naddr == 0)) { + /* + * The MAC instance doesn't support multiple factory + * MAC addresses, we're done here. + */ + return; + } /* - * If the driver supports resource registration, call the driver to - * ask it to register its resources. + * Allocate the space and get all the factory addresses. */ - if (mip->mi_callbacks->mc_callbacks & MC_RESOURCES) - mip->mi_resources(mip->mi_driver); + addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP); + capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr); + + mip->mi_factory_addr_num = capab.mcm_naddr; + mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num * + sizeof (mac_factory_addr_t), KM_SLEEP); + + for (i = 0; i < capab.mcm_naddr; i++) { + bcopy(addr + i * MAXMACADDRLEN, + mip->mi_factory_addr[i].mfa_addr, + mip->mi_type->mt_addr_length); + mip->mi_factory_addr[i].mfa_in_use = B_FALSE; + } + + kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN); } void -mac_ioctl(mac_handle_t mh, queue_t *wq, mblk_t *bp) +mac_addr_factory_fini(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - int cmd = ((struct iocblk *)bp->b_rptr)->ioc_cmd; - - if ((cmd == ND_GET && (mip->mi_callbacks->mc_callbacks & MC_GETPROP)) || - (cmd == ND_SET && (mip->mi_callbacks->mc_callbacks & MC_SETPROP))) { - /* - * If ndd props were registered, call them. - * Note that ndd ioctls are Obsolete - */ - mac_ndd_ioctl(mip, wq, bp); + if (mip->mi_factory_addr == NULL) { + ASSERT(mip->mi_factory_addr_num == 0); return; } - /* - * Call the driver to handle the ioctl. The driver may not support - * any ioctls, in which case we reply with a NAK on its behalf. - */ - if (mip->mi_callbacks->mc_callbacks & MC_IOCTL) - mip->mi_ioctl(mip->mi_driver, wq, bp); - else - miocnak(wq, bp, 0, EINVAL); + kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num * + sizeof (mac_factory_addr_t)); + + mip->mi_factory_addr = NULL; + mip->mi_factory_addr_num = 0; } -const mac_txinfo_t * -mac_do_tx_get(mac_handle_t mh, boolean_t is_vnic) +/* + * Reserve a factory MAC address. If *slot is set to -1, the function + * attempts to reserve any of the available factory MAC addresses and + * returns the reserved slot id. If no slots are available, the function + * returns ENOSPC. 
If *slot is not set to -1, the function reserves
+ * the specified slot if it is available, or returns EBUSY if the slot
+ * is already used. Returns ENOTSUP if the underlying MAC does not
+ * support multiple factory addresses. If the slot number is not -1 but
+ * is invalid, returns EINVAL.
+ */
+int
+mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txinfo_t *mtp;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int i, ret = 0;

+ i_mac_perim_enter(mip);
 /*
- * Grab the lock to prevent us from racing with MAC_PROMISC being
- * changed. This is sufficient since MAC clients are careful to always
- * call mac_txloop_add() prior to enabling MAC_PROMISC, and to disable
- * MAC_PROMISC prior to calling mac_txloop_remove().
+ * Protect against concurrent readers that may need a self-consistent
+ * view of the factory addresses
 */
- rw_enter(&mip->mi_tx_lock, RW_READER);
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);

- if (mac_promisc_get(mh, MAC_PROMISC)) {
- ASSERT(mip->mi_mtfp != NULL);
- if (mip->mi_vnic_present && !is_vnic) {
- mtp = &mip->mi_vnic_txloopinfo;
- } else {
- mtp = &mip->mi_txloopinfo;
+ if (mip->mi_factory_addr_num == 0) {
+ ret = ENOTSUP;
+ goto bail;
+ }
+
+ if (*slot != -1) {
+ /* check the specified slot */
+ if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
+ ret = EINVAL;
+ goto bail;
+ }
+ if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
+ ret = EBUSY;
+ goto bail;
 }
 } else {
- if (mip->mi_vnic_present && !is_vnic) {
- mtp = &mip->mi_vnic_txinfo;
- } else {
- /*
- * Note that we cannot ASSERT() that mip->mi_mtfp is
- * NULL, because to satisfy the above ASSERT(), we
- * have to disable MAC_PROMISC prior to calling
- * mac_txloop_remove().
- */
- mtp = &mip->mi_txinfo;
+ /* pick the next available slot */
+ for (i = 0; i < mip->mi_factory_addr_num; i++) {
+ if (!mip->mi_factory_addr[i].mfa_in_use)
+ break;
+ }
+
+ if (i == mip->mi_factory_addr_num) {
+ ret = ENOSPC;
+ goto bail;
 }
+ *slot = i+1;
 }

- rw_exit(&mip->mi_tx_lock);
- return (mtp);
-}
+ mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
+ mip->mi_factory_addr[*slot-1].mfa_client = mcip;

-/*
- * Invoked by VNIC to obtain the transmit entry point.
- */
-const mac_txinfo_t *
-mac_vnic_tx_get(mac_handle_t mh)
-{
- return (mac_do_tx_get(mh, B_TRUE));
+bail:
+ rw_exit(&mip->mi_rw_lock);
+ i_mac_perim_exit(mip);
+ return (ret);
}

/*
- * Invoked by any non-VNIC client to obtain the transmit entry point.
- * If a VNIC is present, the VNIC transmit function provided by the VNIC
- * will be returned to the MAC client.
+ * Release the specified factory MAC address slot.
 */
-const mac_txinfo_t *
-mac_tx_get(mac_handle_t mh)
-{
- return (mac_do_tx_get(mh, B_FALSE));
-}
-
-link_state_t
-mac_link_get(mac_handle_t mh)
-{
- return (((mac_impl_t *)mh)->mi_linkstate);
-}
-
-mac_notify_handle_t
-mac_notify_add(mac_handle_t mh, mac_notify_t notify, void *arg)
+void
+mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_fn_t *mnfp;
-
- mnfp = kmem_zalloc(sizeof (mac_notify_fn_t), KM_SLEEP);
- mnfp->mnf_fn = notify;
- mnfp->mnf_arg = arg;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;

+ i_mac_perim_enter(mip);
 /*
- * Add it to the head of the 'notify' callback list.
+ * Protect against concurrent readers that may need a self-consistent
+ * view of the factory addresses
 */
- rw_enter(&mip->mi_notify_lock, RW_WRITER);
- mnfp->mnf_nextp = mip->mi_mnfp;
- mip->mi_mnfp = mnfp;
- rw_exit(&mip->mi_notify_lock);
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);

- return ((mac_notify_handle_t)mnfp);
+ ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
+ ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
+
+ mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
+
+ rw_exit(&mip->mi_rw_lock);
+ i_mac_perim_exit(mip);
}

+/*
+ * Stores in mac_addr the value of the specified factory MAC address
+ * slot, and in addr_len its length. If the slot is in use and
+ * client_name is non-NULL, the name of the client owning the address
+ * is also copied out; the client_name buffer must be at least
+ * MAXNAMELEN bytes. The slot number must be valid for the MAC.
+ */
 void
-mac_notify_remove(mac_handle_t mh, mac_notify_handle_t mnh)
+mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
+ uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_fn_t *mnfp = (mac_notify_fn_t *)mnh;
- mac_notify_fn_t **pp;
- mac_notify_fn_t *p;
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ boolean_t in_use;
+
+ ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

 /*
- * Search the 'notify' callback list for the function closure.
+ * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
+ * and mi_rw_lock
 */
- rw_enter(&mip->mi_notify_lock, RW_WRITER);
- for (pp = &(mip->mi_mnfp); (p = *pp) != NULL;
- pp = &(p->mnf_nextp)) {
- if (p == mnfp)
- break;
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
+ *addr_len = mip->mi_type->mt_addr_length;
+ in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
+ if (in_use && client_name != NULL) {
+ bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
+ client_name, MAXNAMELEN);
 }
- ASSERT(p != NULL);
+ if (in_use_arg != NULL)
+ *in_use_arg = in_use;
+ rw_exit(&mip->mi_rw_lock);
+}

- /*
- * Remove it from the list.
- */
- *pp = p->mnf_nextp;
- rw_exit(&mip->mi_notify_lock);
+/*
+ * Returns the number of factory MAC addresses (in addition to the
+ * primary MAC address), 0 if the underlying MAC doesn't support
+ * that feature.
+ */
+uint_t
+mac_addr_factory_num(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;

- /*
- * Free it.
- */
- kmem_free(mnfp, sizeof (mac_notify_fn_t));
+ return (mip->mi_factory_addr_num);
}

+
 void
-mac_notify(mac_handle_t mh)
+mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_type_t type;
+ mac_ring_t *ring;

- for (type = 0; type < MAC_NNOTE; type++)
- i_mac_notify(mip, type);
+ for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
+ ring->mr_flag &= ~flag;
}

/*
- * Register a receive function for this mac.
- * More information on this function's interaction with mac_rx()
- * can be found atop mac_rx().
+ * The following mac_hwrings_xxx() functions are private mac client functions
+ * used by the aggr driver to access and control the underlying HW Rx group
+ * and rings. In this case, the aggr driver has exclusive control of the
+ * underlying HW Rx group/rings; it calls the following functions to
+ * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
+ * addresses, or set up the Rx callback.
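+ *
+ * A rough usage sketch for such an exclusive client, with error
+ * handling omitted and prh standing for the client's receive
+ * resource handle (mac_resource_handle_t):
+ *
+ *	mac_group_handle_t hwgh;
+ *	mac_ring_handle_t hwrh[MAX_RINGS_PER_GROUP];
+ *	int i, cnt;
+ *
+ *	cnt = mac_hwrings_get(mch, &hwgh, hwrh);
+ *	for (i = 0; i < cnt; i++)
+ *		mac_hwring_setup(hwrh[i], prh);
+ *	...
+ *	for (i = 0; i < cnt; i++)
+ *		mac_hwring_teardown(hwrh[i]);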
*/ -mac_rx_handle_t -mac_do_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg, boolean_t is_active) +/* ARGSUSED */ +static void +mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs, + mblk_t *mp_chain, boolean_t loopback) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_rx_fn_t *mrfp; + mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + mac_direct_rx_t proc; + void *arg1; + mac_resource_handle_t arg2; - mrfp = kmem_zalloc(sizeof (mac_rx_fn_t), KM_SLEEP); - mrfp->mrf_fn = rx; - mrfp->mrf_arg = arg; - mrfp->mrf_active = is_active; + proc = srs_rx->sr_func; + arg1 = srs_rx->sr_arg1; + arg2 = mac_srs->srs_mrh; - /* - * Add it to the head of the 'rx' callback list. - */ - rw_enter(&(mip->mi_rx_lock), RW_WRITER); + proc(arg1, arg2, mp_chain, NULL); +} + +/* + * This function is called to get the list of HW rings that are reserved by + * an exclusive mac client. + * + * Return value: the number of HW rings. + */ +int +mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, + mac_ring_handle_t *hwrh) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + mac_group_t *grp = flent->fe_rx_ring_group; + mac_ring_t *ring; + int cnt = 0; /* - * mac_rx() will only call callbacks that are marked inuse. + * The mac client did not reserve any RX group, return directly. + * This is probably because the underlying MAC does not support + * any RX groups. */ - mrfp->mrf_inuse = B_TRUE; - mrfp->mrf_nextp = mip->mi_mrfp; + *hwgh = NULL; + if (grp == NULL) + return (0); /* - * mac_rx() could be traversing the remainder of the list - * and miss the new callback we're adding here. This is not a problem - * because we do not guarantee the callback to take effect immediately - * after mac_rx_add() returns. + * This RX group must be reserved by this mac client. */ - mip->mi_mrfp = mrfp; - rw_exit(&(mip->mi_rx_lock)); + ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && + (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp)))); - return ((mac_rx_handle_t)mrfp); + for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) { + ASSERT(cnt < MAX_RINGS_PER_GROUP); + hwrh[cnt++] = (mac_ring_handle_t)ring; + } + *hwgh = (mac_group_handle_t)grp; + return (cnt); } -mac_rx_handle_t -mac_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg) +/* + * Setup the RX callback of the mac client which exclusively controls HW ring. + */ +void +mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh) { - return (mac_do_rx_add(mh, rx, arg, B_FALSE)); + mac_ring_t *hw_ring = (mac_ring_t *)hwrh; + mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs; + + mac_srs->srs_mrh = prh; + mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process; } -mac_rx_handle_t -mac_active_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg) +void +mac_hwring_teardown(mac_ring_handle_t hwrh) { - return (mac_do_rx_add(mh, rx, arg, B_TRUE)); + mac_ring_t *hw_ring = (mac_ring_t *)hwrh; + mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs; + + mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process; + mac_srs->srs_mrh = NULL; } -/* - * Unregister a receive function for this mac. - * This function does not block if wait is B_FALSE. This is useful - * for clients who call mac_rx_remove() from a non-blockable context. - * More information on this function's interaction with mac_rx() - * can be found atop mac_rx(). 
- */ -void -mac_rx_remove(mac_handle_t mh, mac_rx_handle_t mrh, boolean_t wait) +int +mac_hwring_disable_intr(mac_ring_handle_t rh) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_rx_fn_t *mrfp = (mac_rx_fn_t *)mrh; - mac_rx_fn_t **pp; - mac_rx_fn_t *p; + mac_ring_t *rr_ring = (mac_ring_t *)rh; + mac_intr_t *intr = &rr_ring->mr_info.mri_intr; - /* - * Search the 'rx' callback list for the function closure. - */ - rw_enter(&mip->mi_rx_lock, RW_WRITER); - for (pp = &(mip->mi_mrfp); (p = *pp) != NULL; pp = &(p->mrf_nextp)) { - if (p == mrfp) - break; - } - ASSERT(p != NULL); + return (intr->mi_disable(intr->mi_handle)); +} - /* - * If mac_rx() is running, mark callback for deletion - * and return (if wait is false), or wait until mac_rx() - * exits (if wait is true). - */ - if (mip->mi_rx_ref > 0) { - DTRACE_PROBE1(defer_delete, mac_impl_t *, mip); - p->mrf_inuse = B_FALSE; - mutex_enter(&mip->mi_lock); - mip->mi_rx_removed++; - mutex_exit(&mip->mi_lock); +int +mac_hwring_enable_intr(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; + mac_intr_t *intr = &rr_ring->mr_info.mri_intr; - rw_exit(&mip->mi_rx_lock); - if (wait) - mac_rx_remove_wait(mh); - return; - } + return (intr->mi_enable(intr->mi_handle)); +} + +int +mac_hwring_start(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; - /* Remove it from the list. */ - *pp = p->mrf_nextp; - kmem_free(mrfp, sizeof (mac_rx_fn_t)); - rw_exit(&mip->mi_rx_lock); + MAC_RING_UNMARK(rr_ring, MR_QUIESCE); + return (0); } -/* - * Wait for all pending callback removals to be completed by mac_rx(). - * Note that if we call mac_rx_remove() immediately before this, there is no - * guarantee we would wait *only* on the callback that we specified. - * mac_rx_remove() could have been called by other threads and we would have - * to wait for other marked callbacks to be removed as well. - */ void -mac_rx_remove_wait(mac_handle_t mh) +mac_hwring_stop(mac_ring_handle_t rh) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_ring_t *rr_ring = (mac_ring_t *)rh; - mutex_enter(&mip->mi_lock); - while (mip->mi_rx_removed > 0) { - DTRACE_PROBE1(need_wait, mac_impl_t *, mip); - cv_wait(&mip->mi_rx_cv, &mip->mi_lock); - } - mutex_exit(&mip->mi_lock); + mac_rx_ring_quiesce(rr_ring, MR_QUIESCE); } -mac_txloop_handle_t -mac_txloop_add(mac_handle_t mh, mac_txloop_t tx, void *arg) +mblk_t * +mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_txloop_fn_t *mtfp; + mac_ring_t *rr_ring = (mac_ring_t *)rh; + mac_ring_info_t *info = &rr_ring->mr_info; - mtfp = kmem_zalloc(sizeof (mac_txloop_fn_t), KM_SLEEP); - mtfp->mtf_fn = tx; - mtfp->mtf_arg = arg; + return (info->mri_poll(info->mri_driver, bytes_to_pickup)); +} - /* - * Add it to the head of the 'tx' callback list. - */ - rw_enter(&(mip->mi_tx_lock), RW_WRITER); - mtfp->mtf_nextp = mip->mi_mtfp; - mip->mi_mtfp = mtfp; - rw_exit(&(mip->mi_tx_lock)); +int +mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr) +{ + mac_group_t *group = (mac_group_t *)gh; - return ((mac_txloop_handle_t)mtfp); + return (mac_group_addmac(group, addr)); +} + +int +mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr) +{ + mac_group_t *group = (mac_group_t *)gh; + + return (mac_group_remmac(group, addr)); } /* - * Unregister a transmit function for this mac. This removes the function - * from the list of transmit functions for this mac. + * Set the RX group to be shared/reserved. Note that the group must be + * started/stopped outside of this function. 
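+ *
+ * An illustrative pairing, mirroring what mac_start() and mac_stop()
+ * do for the default group:
+ *
+ *	if (mac_start_group_and_rings(grp) == 0)
+ *		mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
+ *	...
+ *	mac_stop_group_and_rings(grp);
+ *	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);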
*/
 void
-mac_txloop_remove(mac_handle_t mh, mac_txloop_handle_t mth)
+mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txloop_fn_t *mtfp = (mac_txloop_fn_t *)mth;
- mac_txloop_fn_t **pp;
- mac_txloop_fn_t *p;
-
 /*
- * Search the 'tx' callback list for the function.
+ * If there is no change in the group state, just return.
 */
- rw_enter(&(mip->mi_tx_lock), RW_WRITER);
- for (pp = &(mip->mi_mtfp); (p = *pp) != NULL; pp = &(p->mtf_nextp)) {
- if (p == mtfp)
- break;
+ if (grp->mrg_state == state)
+ return;
+
+ switch (state) {
+ case MAC_GROUP_STATE_RESERVED:
+ /*
+ * Successfully reserved the group.
+ *
+ * Given that there is an exclusive client controlling this
+ * group, we enable the group level polling when available,
+ * so that SRSs get to turn on/off individual rings they're
+ * assigned to.
+ */
+ ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
+
+ if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
+ GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
+
+ break;
+
+ case MAC_GROUP_STATE_SHARED:
+ /*
+ * Set all rings of this group to software classified.
+ * If the group has an overriding interrupt, then re-enable it.
+ */
+ ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
+
+ if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
+ GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
+
+ /* The ring is not available for reservations any more */
+ break;
+
+ case MAC_GROUP_STATE_REGISTERED:
+ /* Also callable from mac_register, perim is not held */
+ break;
+
+ default:
+ ASSERT(B_FALSE);
+ break;
 }
- ASSERT(p != NULL);

- /* Remove it from the list. */
- *pp = p->mtf_nextp;
- kmem_free(mtfp, sizeof (mac_txloop_fn_t));
- rw_exit(&(mip->mi_tx_lock));
+ grp->mrg_state = state;
}

-void
-mac_resource_set(mac_handle_t mh, mac_resource_add_t add, void *arg)
+/*
+ * Quiesce future hardware classified packets for the specified Rx ring
+ */
+static void
+mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- /*
- * Update the 'resource_add' callbacks.
- */
- rw_enter(&(mip->mi_resource_lock), RW_WRITER);
- mip->mi_resource_add = add;
- mip->mi_resource_add_arg = arg;
- rw_exit(&(mip->mi_resource_lock));
+ ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
+ ASSERT(ring_flag == MR_CONDEMNED || ring_flag == MR_QUIESCE);
+
+ mutex_enter(&rx_ring->mr_lock);
+ rx_ring->mr_flag |= ring_flag;
+ while (rx_ring->mr_refcnt != 0)
+ cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
+ mutex_exit(&rx_ring->mr_lock);
}

/*
- * Driver support functions.
+ * Please see mac_tx for details about the per cpu locking scheme
 */
-
-mac_register_t *
-mac_alloc(uint_t mac_version)
+static void
+mac_tx_lock_all(mac_client_impl_t *mcip)
{
- mac_register_t *mregp;
+ int i;

- /*
- * Make sure there isn't a version mismatch between the driver and
- * the framework. In the future, if multiple versions are
- * supported, this check could become more sophisticated.
- */
- if (mac_version != MAC_VERSION)
- return (NULL);
-
- mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
- mregp->m_version = mac_version;
- return (mregp);
+ for (i = 0; i <= mac_tx_percpu_cnt; i++)
+ mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

-void
-mac_free(mac_register_t *mregp)
+static void
+mac_tx_unlock_all(mac_client_impl_t *mcip)
{
- kmem_free(mregp, sizeof (mac_register_t));
+ int i;
+
+ for (i = mac_tx_percpu_cnt; i >= 0; i--)
+ mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

-/*
- * Allocate a minor number.
- */ -minor_t -mac_minor_hold(boolean_t sleep) +static void +mac_tx_unlock_allbutzero(mac_client_impl_t *mcip) { - minor_t minor; + int i; - /* - * Grab a value from the arena. - */ - atomic_add_32(&minor_count, 1); + for (i = mac_tx_percpu_cnt; i > 0; i--) + mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock); +} - if (sleep) - minor = (uint_t)id_alloc(minor_ids); - else - minor = (uint_t)id_alloc_nosleep(minor_ids); +static int +mac_tx_sum_refcnt(mac_client_impl_t *mcip) +{ + int i; + int refcnt = 0; - if (minor == 0) { - atomic_add_32(&minor_count, -1); - return (0); - } + for (i = 0; i <= mac_tx_percpu_cnt; i++) + refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt; - return (minor); + return (refcnt); } /* - * Release a previously allocated minor number. + * Stop future Tx packets coming down from the client in preparation for + * quiescing the Tx side. This is needed for dynamic reclaim and reassignment + * of rings between clients */ void -mac_minor_rele(minor_t minor) +mac_tx_client_block(mac_client_impl_t *mcip) { - /* - * Return the value to the arena. - */ - id_free(minor_ids, minor); - atomic_add_32(&minor_count, -1); + mac_tx_lock_all(mcip); + mcip->mci_tx_flag |= MCI_TX_QUIESCE; + while (mac_tx_sum_refcnt(mcip) != 0) { + mac_tx_unlock_allbutzero(mcip); + cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock); + mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock); + mac_tx_lock_all(mcip); + } + mac_tx_unlock_all(mcip); } -uint32_t -mac_no_notification(mac_handle_t mh) +void +mac_tx_client_unblock(mac_client_impl_t *mcip) { - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_unsup_note); + mac_tx_lock_all(mcip); + mcip->mci_tx_flag &= ~MCI_TX_QUIESCE; + mac_tx_unlock_all(mcip); } -boolean_t -mac_is_legacy(mac_handle_t mh) +/* + * Wait for an SRS to quiesce. The SRS worker will signal us when the + * quiesce is done. + */ +static void +mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag) { - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_legacy); + mutex_enter(&srs->srs_lock); + while (!(srs->srs_state & srs_flag)) + cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock); + mutex_exit(&srs->srs_lock); } /* - * mac_register() is how drivers register new MACs with the GLDv3 - * framework. The mregp argument is allocated by drivers using the - * mac_alloc() function, and can be freed using mac_free() immediately upon - * return from mac_register(). Upon success (0 return value), the mhp - * opaque pointer becomes the driver's handle to its MAC interface, and is - * the argument to all other mac module entry points. + * Quiescing an Rx SRS is achieved by the following sequence. The protocol + * works bottom up by cutting off packet flow from the bottommost point in the + * mac, then the SRS, and then the soft rings. There are 2 use cases of this + * mechanism. One is a temporary quiesce of the SRS, such as say while changing + * the Rx callbacks. Another use case is Rx SRS teardown. In the former case + * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used + * for the SRS and MR flags. In the former case the threads pause waiting for + * a restart, while in the latter case the threads exit. The Tx SRS teardown + * is also mostly similar to the above. + * + * 1. Stop future hardware classified packets at the lowest level in the mac. + * Remove any hardware classification rule (CONDEMNED case) and mark the + * rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt + * from increasing. 
Upcalls from the driver that come through hardware
+ * classification will be dropped in mac_rx from now on. Then we wait for
+ * the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
+ * sure there aren't any upcall threads from the driver through hardware
+ * classification. In the case of SRS teardown we also remove the
+ * classification rule in the driver.
+ *
+ * 2. Stop future software classified packets by marking the flow entry with
+ * FE_QUIESCE or FE_CONDEMNED as appropriate, which prevents the refcnt from
+ * increasing. We also remove the flow entry from the table in the latter
+ * case. Then wait for the fe_refcnt to reach an appropriate quiescent value
+ * that indicates there aren't any active threads using that flow entry.
+ *
+ * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
+ * SRS worker thread, and the soft ring threads are quiesced in sequence
+ * with the SRS worker thread serving as a master controller. This
+ * mechanism is explained in mac_srs_worker_quiesce().
+ *
+ * The restart mechanism to reactivate the SRS and softrings is explained
+ * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
+ * restart sequence.
+ */
-int
-mac_register(mac_register_t *mregp, mac_handle_t *mhp)
-{
- mac_impl_t *mip;
- mactype_t *mtype;
- int err = EINVAL;
- struct devnames *dnp = NULL;
- uint_t instance;
- boolean_t style1_created = B_FALSE;
- boolean_t style2_created = B_FALSE;
- mac_capab_legacy_t legacy;
- char *driver;
- minor_t minor = 0;
-
- /* Find the required MAC-Type plugin. */
- if ((mtype = i_mactype_getplugin(mregp->m_type_ident)) == NULL)
- return (EINVAL);
+void
+mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ uint_t mr_flag, srs_done_flag;

- /* Create a mac_impl_t to represent this MAC. */
- mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
+ ASSERT(!(srs->srs_type & SRST_TX));

- /*
- * The mac is not ready for open yet.
- */
- mip->mi_disabled = B_TRUE;
-
- /*
- * When a mac is registered, the m_instance field can be set to:
- *
- * 0: Get the mac's instance number from m_dip.
- * This is usually used for physical device dips.
- *
- * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
- * For example, when an aggregation is created with the key option,
- * "key" will be used as the instance number.
- *
- * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
- * This is often used when a MAC of a virtual link is registered
- * (e.g., aggregation when "key" is not specified, or vnic).
- *
- * Note that the instance number is used to derive the mi_minor field
- * of mac_impl_t, which will then be used to derive the name of kstats
- * and the devfs nodes. The first 2 cases are needed to preserve
- * backward compatibility.
- */ - switch (mregp->m_instance) { - case 0: - instance = ddi_get_instance(mregp->m_dip); - break; - case ((uint_t)-1): - minor = mac_minor_hold(B_TRUE); - if (minor == 0) { - err = ENOSPC; - goto fail; - } - instance = minor - 1; - break; - default: - instance = mregp->m_instance; - if (instance >= MAC_MAX_MINOR) { - err = EINVAL; - goto fail; - } - break; + if (srs_quiesce_flag == SRS_CONDEMNED) { + mr_flag = MR_CONDEMNED; + srs_done_flag = SRS_CONDEMNED_DONE; + if (srs->srs_type & SRST_CLIENT_POLL_ENABLED) + mac_srs_client_poll_disable(srs->srs_mcip, srs); + } else { + ASSERT(srs_quiesce_flag == SRS_QUIESCE); + mr_flag = MR_QUIESCE; + srs_done_flag = SRS_QUIESCE_DONE; + if (srs->srs_type & SRST_CLIENT_POLL_ENABLED) + mac_srs_client_poll_quiesce(srs->srs_mcip, srs); } - mip->mi_minor = (minor_t)(instance + 1); - mip->mi_dip = mregp->m_dip; - - driver = (char *)ddi_driver_name(mip->mi_dip); - - /* Construct the MAC name as <drvname><instance> */ - (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", - driver, instance); - - mip->mi_driver = mregp->m_driver; - - mip->mi_type = mtype; - mip->mi_margin = mregp->m_margin; - mip->mi_info.mi_media = mtype->mt_type; - mip->mi_info.mi_nativemedia = mtype->mt_nativetype; - if (mregp->m_max_sdu <= mregp->m_min_sdu) - goto fail; - mip->mi_sdu_min = mregp->m_min_sdu; - mip->mi_sdu_max = mregp->m_max_sdu; - mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; - /* - * If the media supports a broadcast address, cache a pointer to it - * in the mac_info_t so that upper layers can use it. - */ - mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr; - - /* - * Copy the unicast source address into the mac_info_t, but only if - * the MAC-Type defines a non-zero address length. We need to - * handle MAC-Types that have an address length of 0 - * (point-to-point protocol MACs for example). - */ - if (mip->mi_type->mt_addr_length > 0) { - if (mregp->m_src_addr == NULL) - goto fail; - mip->mi_info.mi_unicst_addr = - kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP); - bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr, - mip->mi_type->mt_addr_length); - + if (srs->srs_ring != NULL) { + mac_rx_ring_quiesce(srs->srs_ring, mr_flag); + } else { /* - * Copy the fixed 'factory' MAC address from the immutable - * info. This is taken to be the MAC address currently in - * use. + * SRS is driven by software classification. In case + * of CONDEMNED, the top level teardown functions will + * deal with flow removal. */ - bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr, - mip->mi_type->mt_addr_length); - /* Copy the destination address if one is provided. */ - if (mregp->m_dst_addr != NULL) { - bcopy(mregp->m_dst_addr, mip->mi_dstaddr, - mip->mi_type->mt_addr_length); + if (srs_quiesce_flag != SRS_CONDEMNED) { + FLOW_MARK(flent, FE_QUIESCE); + mac_flow_wait(flent, FLOW_DRIVER_UPCALL); } - } else if (mregp->m_src_addr != NULL) { - goto fail; } /* - * The format of the m_pdata is specific to the plugin. It is - * passed in as an argument to all of the plugin callbacks. The - * driver can update this information by calling - * mac_pdata_update(). + * Signal the SRS to quiesce itself, and then cv_wait for the + * SRS quiesce to complete. The SRS worker thread will wake us + * up when the quiesce is complete */ - if (mregp->m_pdata != NULL) { - /* - * Verify that the plugin supports MAC plugin data and that - * the supplied data is valid. 
- */
- if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
- goto fail;
- if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
- mregp->m_pdata_size)) {
- goto fail;
- }
- mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
- bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size);
- mip->mi_pdata_size = mregp->m_pdata_size;
- }
+ mac_srs_signal(srs, srs_quiesce_flag);
+ mac_srs_quiesce_wait(srs, srs_done_flag);
+}

- /*
- * Register the private properties.
- */
- mac_register_priv_prop(mip, mregp->m_priv_props,
- mregp->m_priv_prop_count);
+/*
+ * Remove an SRS.
+ */
+void
+mac_rx_srs_remove(mac_soft_ring_set_t *srs)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ int i;

+ mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
 /*
- * Stash the driver callbacks into the mac_impl_t, but first sanity
- * check to make sure all mandatory callbacks are set.
+ * Locate and remove our entry in the fe_rx_srs[] array, and
+ * adjust the fe_rx_srs array entries and array count by
+ * moving the last entry into the vacated spot.
 */
- if (mregp->m_callbacks->mc_getstat == NULL ||
- mregp->m_callbacks->mc_start == NULL ||
- mregp->m_callbacks->mc_stop == NULL ||
- mregp->m_callbacks->mc_setpromisc == NULL ||
- mregp->m_callbacks->mc_multicst == NULL ||
- mregp->m_callbacks->mc_unicst == NULL ||
- mregp->m_callbacks->mc_tx == NULL) {
- goto fail;
+ mutex_enter(&flent->fe_lock);
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ if (flent->fe_rx_srs[i] == srs)
+ break;
 }
- mip->mi_callbacks = mregp->m_callbacks;

- /*
- * Set up the possible transmit routines.
- */
- mip->mi_txinfo.mt_fn = mip->mi_tx;
- mip->mi_txinfo.mt_arg = mip->mi_driver;
+ ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
+ if (i != flent->fe_rx_srs_cnt - 1) {
+ flent->fe_rx_srs[i] =
+ flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
+ i = flent->fe_rx_srs_cnt - 1;
+ }

- mip->mi_legacy = mac_capab_get((mac_handle_t)mip,
- MAC_CAPAB_LEGACY, &legacy);
+ flent->fe_rx_srs[i] = NULL;
+ flent->fe_rx_srs_cnt--;
+ mutex_exit(&flent->fe_lock);

- if (mip->mi_legacy) {
- /*
- * Legacy device. Messages being sent will be looped back
- * by the underlying driver. Therefore the txloop function
- * pointer is the same as the tx function pointer.
- */
- mip->mi_txloopinfo.mt_fn = mip->mi_txinfo.mt_fn;
- mip->mi_txloopinfo.mt_arg = mip->mi_txinfo.mt_arg;
- mip->mi_unsup_note = legacy.ml_unsup_note;
- mip->mi_phy_dev = legacy.ml_dev;
- } else {
- /*
- * Normal device. The framework needs to do the loopback.
- */
- mip->mi_txloopinfo.mt_fn = mac_txloop;
- mip->mi_txloopinfo.mt_arg = mip;
- mip->mi_unsup_note = 0;
- mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
- ddi_get_instance(mip->mi_dip) + 1);
- }
+ mac_srs_free(srs);
+}

- mip->mi_vnic_txinfo.mt_fn = mac_vnic_tx;
- mip->mi_vnic_txinfo.mt_arg = mip;
+static void
+mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
+{
+ mutex_enter(&srs->srs_lock);
+ srs->srs_state &= ~flag;
+ mutex_exit(&srs->srs_lock);
+}
+
+void
+mac_rx_srs_restart(mac_soft_ring_set_t *srs)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ mac_ring_t *mr;

- mip->mi_vnic_txloopinfo.mt_fn = mac_vnic_txloop;
- mip->mi_vnic_txloopinfo.mt_arg = mip;
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
+ ASSERT((srs->srs_type & SRST_TX) == 0);

 /*
- * Allocate a notification thread.
+ * This handles a change in the number of SRSs between the quiesce
+ * and restart operations of a flow.
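+ *
+ * A temporary quiesce/restart cycle on an Rx SRS pairs the two calls
+ * as sketched below; the mac perimeter must be held throughout:
+ *
+ *	mac_rx_srs_quiesce(srs, SRS_QUIESCE);
+ *	... resize the fanout or reassign rings ...
+ *	mac_rx_srs_restart(srs);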
*/ - mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread, - mip, 0, &p0, TS_RUN, minclsyspri); - if (mip->mi_notify_thread == NULL) - goto fail; + if (!SRS_QUIESCED(srs)) + return; /* - * Initialize the kstats for this device. + * Signal the SRS to restart itself. Wait for the restart to complete + * Note that we only restart the SRS if it is not marked as + * permanently quiesced. */ - mac_stat_create(mip); - - - /* set the gldv3 flag in dn_flags */ - dnp = &devnamesp[ddi_driver_major(mip->mi_dip)]; - LOCK_DEV_OPS(&dnp->dn_lock); - dnp->dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER); - UNLOCK_DEV_OPS(&dnp->dn_lock); - - if (mip->mi_minor < MAC_MAX_MINOR + 1) { - /* Create a style-2 DLPI device */ - if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, - DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) - goto fail; - style2_created = B_TRUE; + if (!SRS_QUIESCED_PERMANENT(srs)) { + mac_srs_signal(srs, SRS_RESTART); + mac_srs_quiesce_wait(srs, SRS_RESTART_DONE); + mac_srs_clear_flag(srs, SRS_RESTART_DONE); - /* Create a style-1 DLPI device */ - if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, - mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) - goto fail; - style1_created = B_TRUE; + mac_srs_client_poll_restart(srs->srs_mcip, srs); } - rw_enter(&i_mac_impl_lock, RW_WRITER); - if (mod_hash_insert(i_mac_impl_hash, - (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { - - rw_exit(&i_mac_impl_lock); - err = EEXIST; - goto fail; + /* Finally clear the flags to let the packets in */ + mr = srs->srs_ring; + if (mr != NULL) { + MAC_RING_UNMARK(mr, MR_QUIESCE); + /* In case the ring was stopped, safely restart it */ + (void) mac_start_ring(mr); + } else { + FLOW_UNMARK(flent, FE_QUIESCE); } +} - DTRACE_PROBE2(mac__register, struct devnames *, dnp, - (mac_impl_t *), mip); - - /* - * Mark the MAC to be ready for open. - */ - mip->mi_disabled = B_FALSE; - - rw_exit(&i_mac_impl_lock); - - atomic_inc_32(&i_mac_impl_count); +/* + * Temporary quiesce of a flow and associated Rx SRS. + * Please see block comment above mac_rx_classify_flow_rem. 
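+ *
+ * This is a mac_flow_walk callback; a caller typically applies it to
+ * every subflow of a client, as mac_rx_client_quiesce() does below:
+ *
+ *	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ *	    mac_rx_classify_flow_quiesce, NULL);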
+ */ +/* ARGSUSED */ +int +mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg) +{ + int i; - cmn_err(CE_NOTE, "!%s registered", mip->mi_name); - *mhp = (mac_handle_t)mip; + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i], + SRS_QUIESCE); + } return (0); +} -fail: - if (style1_created) - ddi_remove_minor_node(mip->mi_dip, mip->mi_name); - - if (style2_created) - ddi_remove_minor_node(mip->mi_dip, driver); +/* + * Restart a flow and associated Rx SRS that has been quiesced temporarily + * Please see block comment above mac_rx_classify_flow_rem + */ +/* ARGSUSED */ +int +mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg) +{ + int i; - /* clean up notification thread */ - if (mip->mi_notify_thread != NULL) { - mutex_enter(&mip->mi_notify_bits_lock); - mip->mi_notify_bits = (1 << MAC_NNOTE); - cv_broadcast(&mip->mi_notify_cv); - while (mip->mi_notify_bits != 0) - cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock); - mutex_exit(&mip->mi_notify_bits_lock); - } + for (i = 0; i < flent->fe_rx_srs_cnt; i++) + mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]); - if (mip->mi_info.mi_unicst_addr != NULL) { - kmem_free(mip->mi_info.mi_unicst_addr, - mip->mi_type->mt_addr_length); - mip->mi_info.mi_unicst_addr = NULL; - } + return (0); +} - mac_stat_destroy(mip); +void +mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + mac_impl_t *mip = mcip->mci_mip; + mac_soft_ring_set_t *mac_srs; + int i; - if (mip->mi_type != NULL) { - atomic_dec_32(&mip->mi_type->mt_ref); - mip->mi_type = NULL; - } + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - if (mip->mi_pdata != NULL) { - kmem_free(mip->mi_pdata, mip->mi_pdata_size); - mip->mi_pdata = NULL; - mip->mi_pdata_size = 0; - } + if (flent == NULL) + return; - if (minor != 0) { - ASSERT(minor > MAC_MAX_MINOR); - mac_minor_rele(minor); + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_srs = flent->fe_rx_srs[i]; + mutex_enter(&mac_srs->srs_lock); + if (on) + mac_srs->srs_state |= SRS_QUIESCE_PERM; + else + mac_srs->srs_state &= ~SRS_QUIESCE_PERM; + mutex_exit(&mac_srs->srs_lock); } - - mac_unregister_priv_prop(mip); - - kmem_cache_free(i_mac_impl_cachep, mip); - return (err); } -int -mac_disable(mac_handle_t mh) +void +mac_rx_client_quiesce(mac_client_handle_t mch) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; - /* - * See if there are any other references to this mac_t (e.g., VLAN's). - * If not, set mi_disabled to prevent any new VLAN's from being - * created while we're destroying this mac. 
- */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - if (mip->mi_ref > 0) { - rw_exit(&i_mac_impl_lock); - return (EBUSY); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + if (MCIP_DATAPATH_SETUP(mcip)) { + (void) mac_rx_classify_flow_quiesce(mcip->mci_flent, + NULL); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_rx_classify_flow_quiesce, NULL); } - mip->mi_disabled = B_TRUE; - rw_exit(&i_mac_impl_lock); - return (0); } -int -mac_unregister(mac_handle_t mh) +void +mac_rx_client_restart(mac_client_handle_t mch) { - int err; - mac_impl_t *mip = (mac_impl_t *)mh; - mod_hash_val_t val; - mac_multicst_addr_t *p, *nextp; - mac_margin_req_t *mmr, *nextmmr; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; - /* - * See if there are any other references to this mac_t (e.g., VLAN's). - * If not, set mi_disabled to prevent any new VLAN's from being - * created while we're destroying this mac. Once mac_disable() returns - * 0, the rest of mac_unregister() stuff should continue without - * returning an error. - */ - if (!mip->mi_disabled) { - if ((err = mac_disable(mh)) != 0) - return (err); - } + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - /* - * Clean up notification thread (wait for it to exit). - */ - mutex_enter(&mip->mi_notify_bits_lock); - mip->mi_notify_bits = (1 << MAC_NNOTE); - cv_broadcast(&mip->mi_notify_cv); - while (mip->mi_notify_bits != 0) - cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock); - mutex_exit(&mip->mi_notify_bits_lock); - - if (mip->mi_minor < MAC_MAX_MINOR + 1) { - ddi_remove_minor_node(mip->mi_dip, mip->mi_name); - ddi_remove_minor_node(mip->mi_dip, - (char *)ddi_driver_name(mip->mi_dip)); + if (MCIP_DATAPATH_SETUP(mcip)) { + (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_rx_classify_flow_restart, NULL); } +} - ASSERT(!mip->mi_activelink); - - mac_stat_destroy(mip); - - rw_enter(&i_mac_impl_lock, RW_WRITER); - (void) mod_hash_remove(i_mac_impl_hash, - (mod_hash_key_t)mip->mi_name, &val); - ASSERT(mip == (mac_impl_t *)val); +/* + * This function only quiesces the Tx SRS and softring worker threads. Callers + * need to make sure that there aren't any mac client threads doing current or + * future transmits in the mac before calling this function. + */ +void +mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag) +{ + mac_client_impl_t *mcip = srs->srs_mcip; - ASSERT(i_mac_impl_count > 0); - atomic_dec_32(&i_mac_impl_count); - rw_exit(&i_mac_impl_lock); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - if (mip->mi_pdata != NULL) - kmem_free(mip->mi_pdata, mip->mi_pdata_size); - mip->mi_pdata = NULL; - mip->mi_pdata_size = 0; + ASSERT(srs->srs_type & SRST_TX); + ASSERT(srs_quiesce_flag == SRS_CONDEMNED || + srs_quiesce_flag == SRS_QUIESCE); /* - * Free the list of multicast addresses. + * Signal the SRS to quiesce itself, and then cv_wait for the + * SRS quiesce to complete. The SRS worker thread will wake us + * up when the quiesce is complete */ - for (p = mip->mi_mmap; p != NULL; p = nextp) { - nextp = p->mma_nextp; - kmem_free(p, sizeof (mac_multicst_addr_t)); - } - mip->mi_mmap = NULL; + mac_srs_signal(srs, srs_quiesce_flag); + mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ? + SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE); +} +void +mac_tx_srs_restart(mac_soft_ring_set_t *srs) +{ /* - * Free the list of margin request. + * Resizing the fanout could result in creation of new SRSs. 
+ * They may not necessarily be in the quiesced state in which
+ * case they need to be restarted
 */
- for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
- nextmmr = mmr->mmr_nextp;
- kmem_free(mmr, sizeof (mac_margin_req_t));
- }
- mip->mi_mmrp = NULL;
-
- mip->mi_linkstate = LINK_STATE_UNKNOWN;
- kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
- mip->mi_info.mi_unicst_addr = NULL;
-
- atomic_dec_32(&mip->mi_type->mt_ref);
- mip->mi_type = NULL;
-
- if (mip->mi_minor > MAC_MAX_MINOR)
- mac_minor_rele(mip->mi_minor);
-
- mac_unregister_priv_prop(mip);
-
- cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
-
- kmem_cache_free(i_mac_impl_cachep, mip);
+ if (!SRS_QUIESCED(srs))
+ return;

- return (0);
+ mac_srs_signal(srs, SRS_RESTART);
+ mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
+ mac_srs_clear_flag(srs, SRS_RESTART_DONE);
}

/*
- * To avoid potential deadlocks, mac_rx() releases mi_rx_lock
- * before invoking its list of upcalls. This introduces races with
- * mac_rx_remove() and mac_rx_add(), who can potentially modify the
- * upcall list while mi_rx_lock is not being held. The race with
- * mac_rx_remove() is handled by incrementing mi_rx_ref upon entering
- * mac_rx(); a non-zero mi_rx_ref would tell mac_rx_remove()
- * to not modify the list but instead mark an upcall for deletion.
- * before mac_rx() exits, mi_rx_ref is decremented and if it
- * is 0, the marked upcalls will be removed from the list and freed.
- * The race with mac_rx_add() is harmless because mac_rx_add() only
- * prepends to the list and since mac_rx() saves the list head
- * before releasing mi_rx_lock, any prepended upcall won't be seen
- * until the next packet chain arrives.
- *
- * To minimize lock contention between multiple parallel invocations
- * of mac_rx(), mi_rx_lock is acquired as a READER lock. The
- * use of atomic operations ensures the sanity of mi_rx_ref. mi_rx_lock
- * will be upgraded to WRITER mode when there are marked upcalls to be
- * cleaned.
+ * Temporary quiesce of a flow and associated Tx SRS.
+ * Please see block comment above mac_rx_srs_quiesce
 */
-static void
-mac_do_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain,
- boolean_t active_only)
+/* ARGSUSED */
+int
+mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mblk_t *bp = mp_chain;
- mac_rx_fn_t *mrfp;
-
 /*
- * Call all registered receive functions.
+ * The fe_tx_srs is null for a subflow on an interface that is
+ * not plumbed
 */
- rw_enter(&mip->mi_rx_lock, RW_READER);
- if ((mrfp = mip->mi_mrfp) == NULL) {
- /* There are no registered receive functions. */
- freemsgchain(bp);
- rw_exit(&mip->mi_rx_lock);
- return;
- }
- atomic_inc_32(&mip->mi_rx_ref);
- rw_exit(&mip->mi_rx_lock);
+ if (flent->fe_tx_srs != NULL)
+ mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
+ return (0);
+}

+/* ARGSUSED */
+int
+mac_tx_flow_restart(flow_entry_t *flent, void *arg)
+{
 /*
- * Call registered receive functions.
+ * The fe_tx_srs is null for a subflow on an interface that is
+ * not plumbed
 */
- do {
- mblk_t *recv_bp;
-
- if (active_only && !mrfp->mrf_active) {
- mrfp = mrfp->mrf_nextp;
- if (mrfp == NULL) {
- /*
- * We hit the last receiver, but it's not
- * active.
- */
- freemsgchain(bp);
- }
- continue;
- }
-
- recv_bp = (mrfp->mrf_nextp != NULL) ? copymsgchain(bp) : bp;
- if (recv_bp != NULL) {
- if (mrfp->mrf_inuse) {
- /*
- * Send bp itself and keep the copy.
- * If there's only one active receiver, - * it should get the original message, - * tagged with the hardware checksum flags. - */ - mrfp->mrf_fn(mrfp->mrf_arg, mrh, bp); - bp = recv_bp; - } else { - freemsgchain(recv_bp); - } - } - - mrfp = mrfp->mrf_nextp; - } while (mrfp != NULL); + if (flent->fe_tx_srs != NULL) + mac_tx_srs_restart(flent->fe_tx_srs); + return (0); +} - rw_enter(&mip->mi_rx_lock, RW_READER); - if (atomic_dec_32_nv(&mip->mi_rx_ref) == 0 && mip->mi_rx_removed > 0) { - mac_rx_fn_t **pp, *p; - uint32_t cnt = 0; +void +mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - DTRACE_PROBE1(delete_callbacks, mac_impl_t *, mip); + mac_tx_client_block(mcip); + if (MCIP_TX_SRS(mcip) != NULL) { + mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_tx_flow_quiesce, NULL); + } +} - /* - * Need to become exclusive before doing cleanup - */ - if (rw_tryupgrade(&mip->mi_rx_lock) == 0) { - rw_exit(&mip->mi_rx_lock); - rw_enter(&mip->mi_rx_lock, RW_WRITER); - } +void +mac_tx_client_restart(mac_client_impl_t *mcip) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - /* - * We return if another thread has already entered and cleaned - * up the list. - */ - if (mip->mi_rx_ref > 0 || mip->mi_rx_removed == 0) { - rw_exit(&mip->mi_rx_lock); - return; - } + mac_tx_client_unblock(mcip); + if (MCIP_TX_SRS(mcip) != NULL) { + mac_tx_srs_restart(MCIP_TX_SRS(mcip)); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_tx_flow_restart, NULL); + } +} - /* - * Free removed callbacks. - */ - pp = &mip->mi_mrfp; - while (*pp != NULL) { - if (!(*pp)->mrf_inuse) { - p = *pp; - *pp = (*pp)->mrf_nextp; - kmem_free(p, sizeof (*p)); - cnt++; - continue; - } - pp = &(*pp)->mrf_nextp; - } +void +mac_tx_client_flush(mac_client_impl_t *mcip) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - /* - * Wake up mac_rx_remove_wait() - */ - mutex_enter(&mip->mi_lock); - ASSERT(mip->mi_rx_removed == cnt); - mip->mi_rx_removed = 0; - cv_broadcast(&mip->mi_rx_cv); - mutex_exit(&mip->mi_lock); - } - rw_exit(&mip->mi_rx_lock); + mac_tx_client_quiesce(mcip, SRS_QUIESCE); + mac_tx_client_restart(mcip); } void -mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) +mac_client_quiesce(mac_client_impl_t *mcip) { - mac_do_rx(mh, mrh, mp_chain, B_FALSE); + mac_rx_client_quiesce((mac_client_handle_t)mcip); + mac_tx_client_quiesce(mcip, SRS_QUIESCE); } -/* - * Send a packet chain up to the receive callbacks which declared - * themselves as being active. - */ void -mac_active_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp_chain) +mac_client_restart(mac_client_impl_t *mcip) { - mac_do_rx(arg, mrh, mp_chain, B_TRUE); + mac_rx_client_restart((mac_client_handle_t)mcip); + mac_tx_client_restart(mcip); } /* - * Function passed to the active client sharing a VNIC. This function - * is returned by mac_tx_get() when a VNIC is present. It invokes - * the VNIC transmit entry point which was specified by the VNIC when - * it called mac_vnic_set(). The VNIC transmit entry point will - * pass the packets to the local VNICs and/or to the underlying VNICs - * if needed. + * Allocate a minor number. 
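+ * A successful hold must later be balanced by mac_minor_rele() on the
+ * same minor; an illustrative caller:
+ *
+ *	if ((minor = mac_minor_hold(B_TRUE)) == 0)
+ *		return (ENOSPC);
+ *	...
+ *	mac_minor_rele(minor);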
*/ -static mblk_t * -mac_vnic_tx(void *arg, mblk_t *mp) +minor_t +mac_minor_hold(boolean_t sleep) { - mac_impl_t *mip = arg; - mac_txinfo_t *mtfp; - mac_vnic_tx_t *mvt; + minor_t minor; /* - * There is a race between the notification of the VNIC - * addition and removal, and the processing of the VNIC notification - * by the MAC client. During this window, it is possible for - * an active MAC client to contine invoking mac_vnic_tx() while - * the VNIC has already been removed. So we cannot assume - * that mi_vnic_present will always be true when mac_vnic_tx() - * is invoked. + * Grab a value from the arena. */ - rw_enter(&mip->mi_tx_lock, RW_READER); - if (!mip->mi_vnic_present) { - rw_exit(&mip->mi_tx_lock); - freemsgchain(mp); - return (NULL); - } + atomic_add_32(&minor_count, 1); - ASSERT(mip->mi_vnic_tx != NULL); - mvt = mip->mi_vnic_tx; - MAC_VNIC_TXINFO_REFHOLD(mvt); - rw_exit(&mip->mi_tx_lock); + if (sleep) + minor = (uint_t)id_alloc(minor_ids); + else + minor = (uint_t)id_alloc_nosleep(minor_ids); - mtfp = &mvt->mv_txinfo; - mtfp->mt_fn(mtfp->mt_arg, mp); + if (minor == 0) { + atomic_add_32(&minor_count, -1); + return (0); + } - MAC_VNIC_TXINFO_REFRELE(mvt); - return (NULL); + return (minor); } /* - * Transmit function -- ONLY used when there are registered loopback listeners. + * Release a previously allocated minor number. */ -mblk_t * -mac_do_txloop(void *arg, mblk_t *bp, boolean_t call_vnic) +void +mac_minor_rele(minor_t minor) { - mac_impl_t *mip = arg; - mac_txloop_fn_t *mtfp; - mblk_t *loop_bp, *resid_bp, *next_bp; - - if (call_vnic) { - /* - * In promiscous mode, a copy of the sent packet will - * be sent to the client's promiscous receive entry - * points via mac_vnic_tx()-> - * mac_active_rx_promisc()->mac_rx_default(). - */ - return (mac_vnic_tx(arg, bp)); - } - - while (bp != NULL) { - next_bp = bp->b_next; - bp->b_next = NULL; - - if ((loop_bp = copymsg(bp)) == NULL) - goto noresources; - - if ((resid_bp = mip->mi_tx(mip->mi_driver, bp)) != NULL) { - ASSERT(resid_bp == bp); - freemsg(loop_bp); - goto noresources; - } - - rw_enter(&mip->mi_tx_lock, RW_READER); - mtfp = mip->mi_mtfp; - while (mtfp != NULL && loop_bp != NULL) { - bp = loop_bp; - - /* XXX counter bump if copymsg() fails? */ - if (mtfp->mtf_nextp != NULL) - loop_bp = copymsg(bp); - else - loop_bp = NULL; - - mtfp->mtf_fn(mtfp->mtf_arg, bp); - mtfp = mtfp->mtf_nextp; - } - rw_exit(&mip->mi_tx_lock); - - /* - * It's possible we've raced with the disabling of promiscuous - * mode, in which case we can discard our copy. - */ - if (loop_bp != NULL) - freemsg(loop_bp); - - bp = next_bp; - } - - return (NULL); - -noresources: - bp->b_next = next_bp; - return (bp); + /* + * Return the value to the arena. 
+ */ + id_free(minor_ids, minor); + atomic_add_32(&minor_count, -1); } -mblk_t * -mac_txloop(void *arg, mblk_t *bp) +uint32_t +mac_no_notification(mac_handle_t mh) { - return (mac_do_txloop(arg, bp, B_FALSE)); + mac_impl_t *mip = (mac_impl_t *)mh; + return (mip->mi_unsup_note); } -static mblk_t * -mac_vnic_txloop(void *arg, mblk_t *bp) +/* + * Prevent any new opens of this mac in preparation for unregister + */ +int +i_mac_disable(mac_impl_t *mip) { - return (mac_do_txloop(arg, bp, B_TRUE)); -} + mac_client_impl_t *mcip; -void -mac_link_update(mac_handle_t mh, link_state_t link) -{ - mac_impl_t *mip = (mac_impl_t *)mh; + rw_enter(&i_mac_impl_lock, RW_WRITER); + if (mip->mi_state_flags & MIS_DISABLED) { + /* Already disabled, return success */ + rw_exit(&i_mac_impl_lock); + return (0); + } + /* + * See if there are any other references to this mac_t (e.g., VLAN's). + * If so return failure. If all the other checks below pass, then + * set mi_disabled atomically under the i_mac_impl_lock to prevent + * any new VLAN's from being created or new mac client opens of this + * mac end point. + */ + if (mip->mi_ref > 0) { + rw_exit(&i_mac_impl_lock); + return (EBUSY); + } /* - * Save the link state. + * mac clients must delete all multicast groups they join before + * closing. bcast groups are reference counted, the last client + * to delete the group will wait till the group is physically + * deleted. Since all clients have closed this mac end point + * mi_bcast_ngrps must be zero at this point */ - mip->mi_linkstate = link; + ASSERT(mip->mi_bcast_ngrps == 0); /* - * Send a MAC_NOTE_LINK notification. + * Don't let go of this if it has some flows. + * All other code guarantees no flows are added to a disabled + * mac, therefore it is sufficient to check for the flow table + * only here. */ - i_mac_notify(mip, MAC_NOTE_LINK); + mcip = mac_primary_client_handle(mip); + if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) { + rw_exit(&i_mac_impl_lock); + return (ENOTEMPTY); + } + + mip->mi_state_flags |= MIS_DISABLED; + rw_exit(&i_mac_impl_lock); + return (0); } -void -mac_unicst_update(mac_handle_t mh, const uint8_t *addr) +int +mac_disable_nowait(mac_handle_t mh) { mac_impl_t *mip = (mac_impl_t *)mh; + int err; - if (mip->mi_type->mt_addr_length == 0) - return; + if ((err = i_mac_perim_enter_nowait(mip)) != 0) + return (err); + err = i_mac_disable(mip); + i_mac_perim_exit(mip); + return (err); +} - /* - * If the address has not changed, do nothing. - */ - if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) - return; +int +mac_disable(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + int err; - /* - * Save the address. - */ - bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length); + i_mac_perim_enter(mip); + err = i_mac_disable(mip); + i_mac_perim_exit(mip); /* - * Send a MAC_NOTE_UNICST notification. + * Clean up notification thread and wait for it to exit. */ - i_mac_notify(mip, MAC_NOTE_UNICST); -} + if (err == 0) + i_mac_notify_exit(mip); -void -mac_tx_update(mac_handle_t mh) -{ - /* - * Send a MAC_NOTE_TX notification. - */ - i_mac_notify((mac_impl_t *)mh, MAC_NOTE_TX); + return (err); } -void -mac_resource_update(mac_handle_t mh) +/* + * Called when the MAC instance has a non empty flow table, to de-multiplex + * incoming packets to the right flow. + * The MAC's rw lock is assumed held as a READER. 
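+ *
+ * For illustration, the return-value contract implemented below: a NULL
+ * return means the packet was delivered to a flow's callback, while a
+ * non-NULL return hands the unclassified packet back to the caller. A
+ * hypothetical caller would do:
+ *
+ *	if ((mp = mac_rx_classify(mip, mrh, mp)) != NULL) {
+ *		... deliver mp through the default receive path ...
+ *	}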
+ */ +/* ARGSUSED */ +static mblk_t * +mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) { + flow_entry_t *flent = NULL; + uint_t flags = FLOW_INBOUND; + int err; + /* - * Send a MAC_NOTE_RESOURCE notification. + * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN + * to mac_flow_lookup() so that the VLAN packets can be successfully + * passed to the non-VLAN aggregation flows. + * + * Note that there is possibly a race between this and + * mac_unicast_remove/add() and VLAN packets could be incorrectly + * classified to non-VLAN flows of non-aggregation mac clients. These + * VLAN packets will be then filtered out by the mac module. */ - i_mac_notify((mac_impl_t *)mh, MAC_NOTE_RESOURCE); -} - -mac_resource_handle_t -mac_resource_add(mac_handle_t mh, mac_resource_t *mrp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - mac_resource_handle_t mrh; - mac_resource_add_t add; - void *arg; - - rw_enter(&mip->mi_resource_lock, RW_READER); - add = mip->mi_resource_add; - arg = mip->mi_resource_add_arg; + if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) + flags |= FLOW_IGNORE_VLAN; - if (add != NULL) - mrh = add(arg, mrp); - else - mrh = NULL; - rw_exit(&mip->mi_resource_lock); + err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); + if (err != 0) { + /* no registered receive function */ + return (mp); + } else { + mac_client_impl_t *mcip; - return (mrh); + /* + * This flent might just be an additional one on the MAC client, + * i.e. for classification purposes (different fdesc), however + * the resources, SRS et. al., are in the mci_flent, so if + * this isn't the mci_flent, we need to get it. + */ + if ((mcip = flent->fe_mcip) != NULL && + mcip->mci_flent != flent) { + FLOW_REFRELE(flent); + flent = mcip->mci_flent; + FLOW_TRY_REFHOLD(flent, err); + if (err != 0) + return (mp); + } + (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp, + B_FALSE); + FLOW_REFRELE(flent); + } + return (NULL); } -int -mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize) +mblk_t * +mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) { mac_impl_t *mip = (mac_impl_t *)mh; + mblk_t *bp, *bp1, **bpp, *list = NULL; /* - * Verify that the plugin supports MAC plugin data and that the - * supplied data is valid. + * We walk the chain and attempt to classify each packet. + * The packets that couldn't be classified will be returned + * back to the caller. */ - if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY)) - return (EINVAL); - if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize)) - return (EINVAL); + bp = mp_chain; + bpp = &list; + while (bp != NULL) { + bp1 = bp; + bp = bp->b_next; + bp1->b_next = NULL; - if (mip->mi_pdata != NULL) - kmem_free(mip->mi_pdata, mip->mi_pdata_size); + if (mac_rx_classify(mip, mrh, bp1) != NULL) { + *bpp = bp1; + bpp = &bp1->b_next; + } + } + return (list); +} - mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP); - bcopy(mac_pdata, mip->mi_pdata, dsize); - mip->mi_pdata_size = dsize; +static int +mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg) +{ + mac_ring_handle_t ring = arg; - /* - * Since the MAC plugin data is used to construct MAC headers that - * were cached in fast-path headers, we need to flush fast-path - * information for links associated with this mac. 
- */ - i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH); + if (flent->fe_tx_srs) + mac_tx_srs_wakeup(flent->fe_tx_srs, ring); return (0); } void -mac_multicst_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg, - boolean_t add) +i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_multicst_addr_t *p; + mac_client_impl_t *cclient; + mac_soft_ring_set_t *mac_srs; /* - * If no specific refresh function was given then default to the - * driver's m_multicst entry point. + * After grabbing the mi_rw_lock, the list of clients can't change. + * If there are any clients mi_disabled must be B_FALSE and can't + * get set since there are clients. If there aren't any clients we + * don't do anything. In any case the mip has to be valid. The driver + * must make sure that it goes single threaded (with respect to mac + * calls) and wait for all pending mac calls to finish before calling + * mac_unregister. */ - if (refresh == NULL) { - refresh = mip->mi_multicst; - arg = mip->mi_driver; + rw_enter(&i_mac_impl_lock, RW_READER); + if (mip->mi_state_flags & MIS_DISABLED) { + rw_exit(&i_mac_impl_lock); + return; } - ASSERT(refresh != NULL); /* - * Walk the multicast address list and call the refresh function for - * each address. + * Get MAC tx srs from walking mac_client_handle list. */ - rw_enter(&(mip->mi_data_lock), RW_READER); - for (p = mip->mi_mmap; p != NULL; p = p->mma_nextp) - refresh(arg, add, p->mma_addr); - rw_exit(&(mip->mi_data_lock)); + rw_enter(&mip->mi_rw_lock, RW_READER); + for (cclient = mip->mi_clients_list; cclient != NULL; + cclient = cclient->mci_client_next) { + if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) + mac_tx_srs_wakeup(mac_srs, ring); + if (!FLOW_TAB_EMPTY(cclient->mci_subflow_tab)) { + (void) mac_flow_walk_nolock(cclient->mci_subflow_tab, + mac_tx_flow_srs_wakeup, ring); + } + } + rw_exit(&mip->mi_rw_lock); + rw_exit(&i_mac_impl_lock); } +/* ARGSUSED */ void -mac_unicst_refresh(mac_handle_t mh, mac_unicst_t refresh, void *arg) +mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg, + boolean_t add) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_impl_t *mip = (mac_impl_t *)mh; + + i_mac_perim_enter((mac_impl_t *)mh); /* * If no specific refresh function was given then default to the - * driver's mi_unicst entry point. + * driver's m_multicst entry point. */ if (refresh == NULL) { - refresh = mip->mi_unicst; + refresh = mip->mi_multicst; arg = mip->mi_driver; } - ASSERT(refresh != NULL); - /* - * Call the refresh function with the current unicast address. - */ - refresh(arg, mip->mi_addr); + mac_bcast_refresh(mip, refresh, arg, add); + i_mac_perim_exit((mac_impl_t *)mh); } void @@ -2352,7 +2416,7 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current) mac_margin_req_t **pp, *p; int err = 0; - rw_enter(&(mip->mi_data_lock), RW_WRITER); + rw_enter(&(mip->mi_rw_lock), RW_WRITER); if (current) *marginp = mip->mi_margin; @@ -2369,7 +2433,7 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current) * Check whether the given margin is already in the list. If so, * bump the reference count. 
*/ - for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) { + for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) { if (p->mmr_margin == *marginp) { /* * The margin requested is already in the list, @@ -2383,18 +2447,14 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current) } - if ((p = kmem_zalloc(sizeof (mac_margin_req_t), KM_NOSLEEP)) == NULL) { - err = ENOMEM; - goto done; - } - + p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP); p->mmr_margin = *marginp; p->mmr_ref++; p->mmr_nextp = *pp; *pp = p; done: - rw_exit(&(mip->mi_data_lock)); + rw_exit(&(mip->mi_rw_lock)); return (err); } @@ -2409,7 +2469,7 @@ mac_margin_remove(mac_handle_t mh, uint32_t margin) mac_margin_req_t **pp, *p; int err = 0; - rw_enter(&(mip->mi_data_lock), RW_WRITER); + rw_enter(&(mip->mi_rw_lock), RW_WRITER); /* * Find the entry in the list for the given margin. */ @@ -2442,30 +2502,17 @@ mac_margin_remove(mac_handle_t mh, uint32_t margin) *pp = p->mmr_nextp; kmem_free(p, sizeof (mac_margin_req_t)); done: - rw_exit(&(mip->mi_data_lock)); + rw_exit(&(mip->mi_rw_lock)); return (err); } -/* - * The mac client requests to get the mac's current margin value. - */ -void -mac_margin_get(mac_handle_t mh, uint32_t *marginp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - rw_enter(&(mip->mi_data_lock), RW_READER); - *marginp = mip->mi_margin; - rw_exit(&(mip->mi_data_lock)); -} - boolean_t mac_margin_update(mac_handle_t mh, uint32_t margin) { mac_impl_t *mip = (mac_impl_t *)mh; uint32_t margin_needed = 0; - rw_enter(&(mip->mi_data_lock), RW_WRITER); + rw_enter(&(mip->mi_rw_lock), RW_WRITER); if (mip->mi_mmrp != NULL) margin_needed = mip->mi_mmrp->mmr_margin; @@ -2473,7 +2520,7 @@ mac_margin_update(mac_handle_t mh, uint32_t margin) if (margin_needed <= margin) mip->mi_margin = margin; - rw_exit(&(mip->mi_data_lock)); + rw_exit(&(mip->mi_rw_lock)); if (margin_needed <= margin) i_mac_notify(mip, MAC_NOTE_MARGIN); @@ -2481,287 +2528,48 @@ mac_margin_update(mac_handle_t mh, uint32_t margin) return (margin_needed <= margin); } -boolean_t -mac_do_active_set(mac_handle_t mh, boolean_t shareable) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - mutex_enter(&mip->mi_activelink_lock); - if (mip->mi_activelink) { - mutex_exit(&mip->mi_activelink_lock); - return (B_FALSE); - } - mip->mi_activelink = B_TRUE; - mip->mi_shareable = shareable; - mutex_exit(&mip->mi_activelink_lock); - return (B_TRUE); -} - /* - * Called by MAC clients. By default, active MAC clients cannot - * share the NIC with VNICs. + * MAC Type Plugin functions. */ -boolean_t -mac_active_set(mac_handle_t mh) -{ - return (mac_do_active_set(mh, B_FALSE)); -} -/* - * Called by MAC clients which can share the NIC with VNICS, e.g. DLS. 
- */ -boolean_t -mac_active_shareable_set(mac_handle_t mh) +mactype_t * +mactype_getplugin(const char *pname) { - return (mac_do_active_set(mh, B_TRUE)); -} - -void -mac_active_clear(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - mutex_enter(&mip->mi_activelink_lock); - ASSERT(mip->mi_activelink); - mip->mi_activelink = B_FALSE; - mutex_exit(&mip->mi_activelink_lock); -} - -boolean_t -mac_vnic_set(mac_handle_t mh, mac_txinfo_t *tx_info, mac_getcapab_t getcapab_fn, - void *getcapab_arg) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - mac_vnic_tx_t *vnic_tx; + mactype_t *mtype = NULL; + boolean_t tried_modload = B_FALSE; - mutex_enter(&mip->mi_activelink_lock); - rw_enter(&mip->mi_tx_lock, RW_WRITER); - ASSERT(!mip->mi_vnic_present); + mutex_enter(&i_mactype_lock); - if (mip->mi_activelink && !mip->mi_shareable) { +find_registered_mactype: + if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname, + (mod_hash_val_t *)&mtype) != 0) { + if (!tried_modload) { + /* + * If the plugin has not yet been loaded, then + * attempt to load it now. If modload() succeeds, + * the plugin should have registered using + * mactype_register(), in which case we can go back + * and attempt to find it again. + */ + if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) { + tried_modload = B_TRUE; + goto find_registered_mactype; + } + } + } else { /* - * The NIC is already used by an active client which cannot - * share it with VNICs. + * Note that there's no danger that the plugin we've loaded + * could be unloaded between the modload() step and the + * reference count bump here, as we're holding + * i_mactype_lock, which mactype_unregister() also holds. */ - rw_exit(&mip->mi_tx_lock); - mutex_exit(&mip->mi_activelink_lock); - return (B_FALSE); - } - - vnic_tx = kmem_cache_alloc(mac_vnic_tx_cache, KM_SLEEP); - vnic_tx->mv_refs = 0; - vnic_tx->mv_txinfo = *tx_info; - vnic_tx->mv_clearing = B_FALSE; - - mip->mi_vnic_present = B_TRUE; - mip->mi_vnic_tx = vnic_tx; - mip->mi_vnic_getcapab_fn = getcapab_fn; - mip->mi_vnic_getcapab_arg = getcapab_arg; - rw_exit(&mip->mi_tx_lock); - mutex_exit(&mip->mi_activelink_lock); - - i_mac_notify(mip, MAC_NOTE_VNIC); - return (B_TRUE); -} - -void -mac_vnic_clear(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - mac_vnic_tx_t *vnic_tx; - - rw_enter(&mip->mi_tx_lock, RW_WRITER); - ASSERT(mip->mi_vnic_present); - mip->mi_vnic_present = B_FALSE; - /* - * Setting mi_vnic_tx to NULL here under the lock guarantees - * that no new references to the current VNIC transmit structure - * will be taken by mac_vnic_tx(). This is a necessary condition - * for safely waiting for the reference count to drop to - * zero below. - */ - vnic_tx = mip->mi_vnic_tx; - mip->mi_vnic_tx = NULL; - mip->mi_vnic_getcapab_fn = NULL; - mip->mi_vnic_getcapab_arg = NULL; - rw_exit(&mip->mi_tx_lock); - - i_mac_notify(mip, MAC_NOTE_VNIC); - - /* - * Wait for all TX calls referencing the VNIC transmit - * entry point that was removed to complete. - */ - mutex_enter(&vnic_tx->mv_lock); - vnic_tx->mv_clearing = B_TRUE; - while (vnic_tx->mv_refs > 0) - cv_wait(&vnic_tx->mv_cv, &vnic_tx->mv_lock); - mutex_exit(&vnic_tx->mv_lock); - kmem_cache_free(mac_vnic_tx_cache, vnic_tx); -} - -/* - * mac_info_get() is used for retrieving the mac_info when a DL_INFO_REQ is - * issued before a DL_ATTACH_REQ. we walk the i_mac_impl_hash table and find - * the first mac_impl_t with a matching driver name; then we copy its mac_info_t - * to the caller. 
we do all this with i_mac_impl_lock held so the mac_impl_t - * cannot disappear while we are accessing it. - */ -typedef struct i_mac_info_state_s { - const char *mi_name; - mac_info_t *mi_infop; -} i_mac_info_state_t; - -/*ARGSUSED*/ -static uint_t -i_mac_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) -{ - i_mac_info_state_t *statep = arg; - mac_impl_t *mip = (mac_impl_t *)val; - - if (mip->mi_disabled) - return (MH_WALK_CONTINUE); - - if (strcmp(statep->mi_name, - ddi_driver_name(mip->mi_dip)) != 0) - return (MH_WALK_CONTINUE); - - statep->mi_infop = &mip->mi_info; - return (MH_WALK_TERMINATE); -} - -boolean_t -mac_info_get(const char *name, mac_info_t *minfop) -{ - i_mac_info_state_t state; - - rw_enter(&i_mac_impl_lock, RW_READER); - state.mi_name = name; - state.mi_infop = NULL; - mod_hash_walk(i_mac_impl_hash, i_mac_info_walker, &state); - if (state.mi_infop == NULL) { - rw_exit(&i_mac_impl_lock); - return (B_FALSE); - } - *minfop = *state.mi_infop; - rw_exit(&i_mac_impl_lock); - return (B_TRUE); -} - -boolean_t -mac_do_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data, - boolean_t is_vnic) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - if (!is_vnic) { - rw_enter(&mip->mi_tx_lock, RW_READER); - if (mip->mi_vnic_present) { - boolean_t rv; - - rv = mip->mi_vnic_getcapab_fn(mip->mi_vnic_getcapab_arg, - cap, cap_data); - rw_exit(&mip->mi_tx_lock); - return (rv); - } - rw_exit(&mip->mi_tx_lock); - } - - if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) - return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); - else - return (B_FALSE); -} - -boolean_t -mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) -{ - return (mac_do_capab_get(mh, cap, cap_data, B_FALSE)); -} - -boolean_t -mac_vnic_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) -{ - return (mac_do_capab_get(mh, cap, cap_data, B_TRUE)); -} - -boolean_t -mac_sap_verify(mac_handle_t mh, uint32_t sap, uint32_t *bind_sap) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_type->mt_ops.mtops_sap_verify(sap, bind_sap, - mip->mi_pdata)); -} - -mblk_t * -mac_header(mac_handle_t mh, const uint8_t *daddr, uint32_t sap, mblk_t *payload, - size_t extra_len) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_type->mt_ops.mtops_header(mip->mi_addr, daddr, sap, - mip->mi_pdata, payload, extra_len)); -} - -int -mac_header_info(mac_handle_t mh, mblk_t *mp, mac_header_info_t *mhip) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_type->mt_ops.mtops_header_info(mp, mip->mi_pdata, - mhip)); -} - -mblk_t * -mac_header_cook(mac_handle_t mh, mblk_t *mp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_COOK) { - if (DB_REF(mp) > 1) { - mblk_t *newmp = copymsg(mp); - if (newmp == NULL) - return (NULL); - freemsg(mp); - mp = newmp; - } - return (mip->mi_type->mt_ops.mtops_header_cook(mp, - mip->mi_pdata)); - } - return (mp); -} - -mblk_t * -mac_header_uncook(mac_handle_t mh, mblk_t *mp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_UNCOOK) { - if (DB_REF(mp) > 1) { - mblk_t *newmp = copymsg(mp); - if (newmp == NULL) - return (NULL); - freemsg(mp); - mp = newmp; - } - return (mip->mi_type->mt_ops.mtops_header_uncook(mp, - mip->mi_pdata)); + atomic_inc_32(&mtype->mt_ref); } - return (mp); -} - -void -mac_init_ops(struct dev_ops *ops, const char *name) -{ - dld_init_ops(ops, name); -} -void -mac_fini_ops(struct dev_ops *ops) -{ - dld_fini_ops(ops); + mutex_exit(&i_mactype_lock); + return (mtype); 
} -/* - * MAC Type Plugin functions. - */ - mactype_register_t * mactype_alloc(uint_t mactype_version) { @@ -2878,19 +2686,70 @@ done: return (err); } +/* + * Returns TRUE when the specified property is intended for the MAC framework, + * as opposed to driver defined properties. + */ +static boolean_t +mac_is_macprop(mac_prop_t *macprop) +{ + switch (macprop->mp_id) { + case MAC_PROP_MAXBW: + case MAC_PROP_PRIO: + case MAC_PROP_BIND_CPU: + return (B_TRUE); + default: + return (B_FALSE); + } +} + +/* + * mac_set_prop() sets mac or hardware driver properties: + * mac properties include maxbw, priority, and cpu binding list. Driver + * properties are private properties to the hardware, such as mtu, speed + * etc. + * If the property is a driver property, mac_set_prop() calls driver's callback + * function to set it. + * If the property is a mac property, mac_set_prop() invokes mac_set_resources() + * which will cache the property value in mac_impl_t and may call + * mac_client_set_resource() to update property value of the primary mac client, + * if it exists. + */ int mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize) { int err = ENOTSUP; mac_impl_t *mip = (mac_impl_t *)mh; + ASSERT(MAC_PERIM_HELD(mh)); + + /* If it is mac property, call mac_set_resources() */ + if (mac_is_macprop(macprop)) { + mac_resource_props_t mrp; + + if (valsize < sizeof (mac_resource_props_t)) + return (EINVAL); + bzero(&mrp, sizeof (mac_resource_props_t)); + bcopy(val, &mrp, sizeof (mrp)); + return (mac_set_resources(mh, &mrp)); + } + /* For driver properties, call driver's callback */ if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { err = mip->mi_callbacks->mc_setprop(mip->mi_driver, macprop->mp_name, macprop->mp_id, valsize, val); } + return (err); } +/* + * mac_get_prop() gets mac or hardware driver properties. + * + * If the property is a driver property, mac_get_prop() calls driver's callback + * function to get it. + * If the property is a mac property, mac_get_prop() invokes mac_get_resources() + * which returns the cached value in mac_impl_t. + */ int mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, uint_t *perm) @@ -2900,6 +2759,18 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, uint32_t sdu; link_state_t link_state; + /* If mac property, read from cache */ + if (mac_is_macprop(macprop)) { + mac_resource_props_t mrp; + + if (valsize < sizeof (mac_resource_props_t)) + return (EINVAL); + bzero(&mrp, sizeof (mac_resource_props_t)); + mac_get_resources(mh, &mrp); + bcopy(&mrp, val, sizeof (mac_resource_props_t)); + return (0); + } + switch (macprop->mp_id) { case MAC_PROP_MTU: if (valsize < sizeof (sdu)) @@ -2932,7 +2803,9 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, return (0); default: break; + } + /* If driver property, request from driver */ if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) { err = mip->mi_callbacks->mc_getprop(mip->mi_driver, macprop->mp_name, macprop->mp_id, macprop->mp_flags, @@ -2941,21 +2814,7 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, return (err); } -int -mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - if (sdu_max <= mip->mi_sdu_min) - return (EINVAL); - mip->mi_sdu_max = sdu_max; - - /* Send a MAC_NOTE_SDU_SIZE notification. 
*/
-	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
-	return (0);
-}
-
-static void
+void
 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
 {
 	mac_priv_prop_t *mpriv;
@@ -2969,7 +2828,7 @@ mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
 	mip->mi_priv_prop_count = nprop;
 }
 
-static void
+void
 mac_unregister_priv_prop(mac_impl_t *mip)
 {
 	mac_priv_prop_t	*mpriv;
@@ -2981,3 +2840,2283 @@ mac_unregister_priv_prop(mac_impl_t *mip)
 	}
 	mip->mi_priv_prop_count = 0;
 }
+
+/*
+ * mac_ring_t 'mr' macros. Some rogue drivers may access the ring structure
+ * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
+ * cases, if MAC frees the ring structure after mac_stop_ring(), any
+ * illegal access to the ring structure coming from the driver will panic
+ * the system. In order to protect the system from such inadvertent access,
+ * we maintain a cache of rings in the mac_impl_t after they get freed.
+ * When packets are received on freed rings, MAC (through the generation
+ * count mechanism) will drop such packets.
+ */
+static mac_ring_t *
+mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
+{
+	mac_ring_t *ring;
+
+	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+		mutex_enter(&mip->mi_ring_lock);
+		if (mip->mi_ring_freelist != NULL) {
+			ring = mip->mi_ring_freelist;
+			mip->mi_ring_freelist = ring->mr_next;
+			bzero(ring, sizeof (mac_ring_t));
+		} else {
+			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
+		}
+		mutex_exit(&mip->mi_ring_lock);
+	} else {
+		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
+	}
+	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
+	return (ring);
+}
+
+static void
+mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
+{
+	if (ring->mr_type == MAC_RING_TYPE_RX) {
+		mutex_enter(&mip->mi_ring_lock);
+		ring->mr_state = MR_FREE;
+		ring->mr_flag = 0;
+		ring->mr_next = mip->mi_ring_freelist;
+		mip->mi_ring_freelist = ring;
+		mutex_exit(&mip->mi_ring_lock);
+	} else {
+		kmem_free(ring, sizeof (mac_ring_t));
+	}
+}
+
+static void
+mac_ring_freeall(mac_impl_t *mip)
+{
+	mac_ring_t *ring, *ring_next;
+
+	mutex_enter(&mip->mi_ring_lock);
+	ring = mip->mi_ring_freelist;
+	while (ring != NULL) {
+		ring_next = ring->mr_next;
+		kmem_cache_free(mac_ring_cache, ring);
+		ring = ring_next;
+	}
+	mip->mi_ring_freelist = NULL;
+	mutex_exit(&mip->mi_ring_lock);
+}
+
+int
+mac_start_ring(mac_ring_t *ring)
+{
+	int rv = 0;
+
+	if (ring->mr_start != NULL)
+		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
+
+	return (rv);
+}
+
+void
+mac_stop_ring(mac_ring_t *ring)
+{
+	if (ring->mr_stop != NULL)
+		ring->mr_stop(ring->mr_driver);
+
+	/*
+	 * Increment the ring generation number for this ring.
+	 */
+	ring->mr_gen_num++;
+}
+
+int
+mac_start_group(mac_group_t *group)
+{
+	int rv = 0;
+
+	if (group->mrg_start != NULL)
+		rv = group->mrg_start(group->mrg_driver);
+
+	return (rv);
+}
+
+void
+mac_stop_group(mac_group_t *group)
+{
+	if (group->mrg_stop != NULL)
+		group->mrg_stop(group->mrg_driver);
+}
+
+/*
+ * Called from mac_start() on the default Rx group. Broadcast and multicast
+ * packets are received only on the default group. Hence the default group
+ * needs to be up even if the primary client is not up, for the other groups
+ * to be functional. We do this by calling this function at mac_start time
+ * itself. However, the broadcast packets that are received can't make their
+ * way beyond mac_rx until a mac client creates a broadcast flow.
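+ *
+ * As an aside on the generation count mentioned further above: the intent
+ * is that a driver captures the generation number passed to its mr_start
+ * entry point and tags received packets with it. A minimal sketch, with
+ * hypothetical xx_* driver names (the argument types are assumptions
+ * based on the mr_start call in mac_start_ring() above):
+ *
+ *	static int
+ *	xx_ring_start(mac_ring_driver_t arg, uint64_t gen_num)
+ *	{
+ *		((xx_rx_ring_t *)arg)->xr_gen_num = gen_num;
+ *		return (0);
+ *	}
+ *
+ * so that packets later received on a ring that has since been stopped
+ * carry a stale generation number and can be dropped by the MAC layer.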
+ */ +static int +mac_start_group_and_rings(mac_group_t *group) +{ + mac_ring_t *ring; + int rv = 0; + + ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED); + if ((rv = mac_start_group(group)) != 0) + return (rv); + + for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { + ASSERT(ring->mr_state == MR_FREE); + if ((rv = mac_start_ring(ring)) != 0) + goto error; + ring->mr_state = MR_INUSE; + ring->mr_classify_type = MAC_SW_CLASSIFIER; + } + return (0); + +error: + mac_stop_group_and_rings(group); + return (rv); +} + +/* Called from mac_stop on the default Rx group */ +static void +mac_stop_group_and_rings(mac_group_t *group) +{ + mac_ring_t *ring; + + for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { + if (ring->mr_state != MR_FREE) { + mac_stop_ring(ring); + ring->mr_state = MR_FREE; + ring->mr_flag = 0; + ring->mr_classify_type = MAC_NO_CLASSIFIER; + } + } + mac_stop_group(group); +} + + +static mac_ring_t * +mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index, + mac_capab_rings_t *cap_rings) +{ + mac_ring_t *ring; + mac_ring_info_t ring_info; + + ring = mac_ring_alloc(mip, cap_rings); + + /* Prepare basic information of ring */ + ring->mr_index = index; + ring->mr_type = group->mrg_type; + ring->mr_gh = (mac_group_handle_t)group; + + /* Insert the new ring to the list. */ + ring->mr_next = group->mrg_rings; + group->mrg_rings = ring; + + /* Zero to reuse the info data structure */ + bzero(&ring_info, sizeof (ring_info)); + + /* Query ring information from driver */ + cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index, + index, &ring_info, (mac_ring_handle_t)ring); + + ring->mr_info = ring_info; + + /* Update ring's status */ + ring->mr_state = MR_FREE; + ring->mr_flag = 0; + + /* Update the ring count of the group */ + group->mrg_cur_count++; + return (ring); +} + +/* + * Rings are chained together for easy regrouping. + */ +static void +mac_init_group(mac_impl_t *mip, mac_group_t *group, int size, + mac_capab_rings_t *cap_rings) +{ + int index; + + /* + * Initialize all ring members of this group. Size of zero will not + * enter the loop, so it's safe for initializing an empty group. + */ + for (index = size - 1; index >= 0; index--) + (void) mac_init_ring(mip, group, index, cap_rings); +} + +int +mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) +{ + mac_capab_rings_t *cap_rings; + mac_group_t *group, *groups; + mac_group_info_t group_info; + uint_t group_free = 0; + uint_t ring_left; + mac_ring_t *ring; + int g, err = 0; + + switch (rtype) { + case MAC_RING_TYPE_RX: + ASSERT(mip->mi_rx_groups == NULL); + + cap_rings = &mip->mi_rx_rings_cap; + cap_rings->mr_type = MAC_RING_TYPE_RX; + break; + case MAC_RING_TYPE_TX: + ASSERT(mip->mi_tx_groups == NULL); + + cap_rings = &mip->mi_tx_rings_cap; + cap_rings->mr_type = MAC_RING_TYPE_TX; + break; + default: + ASSERT(B_FALSE); + } + + if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, + cap_rings)) + return (0); + + /* + * Allocate a contiguous buffer for all groups. + */ + groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1), + KM_SLEEP); + + ring_left = cap_rings->mr_rnum; + + /* + * Get all ring groups if any, and get their ring members + * if any. 
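+	 *
+	 * For illustration, the driver's mr_gget() callback is expected to
+	 * fill in the mac_group_info_t it is handed. A minimal sketch, with
+	 * hypothetical xx_* driver names (the signature mirrors the mr_gget
+	 * call below; the mac_group_driver_t cast is an assumption):
+	 *
+	 *	static void
+	 *	xx_group_get(void *arg, mac_ring_type_t rtype, int g_index,
+	 *	    mac_group_info_t *infop, mac_group_handle_t gh)
+	 *	{
+	 *		xx_group_t *grp = &((xx_t *)arg)->xx_groups[g_index];
+	 *
+	 *		grp->xg_gh = gh;
+	 *		infop->mgi_driver = (mac_group_driver_t)grp;
+	 *		infop->mgi_count = XX_RINGS_PER_GROUP;
+	 *		infop->mgi_addmac = xx_group_addmac;
+	 *		infop->mgi_remmac = xx_group_remmac;
+	 *	}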
+	 */
+	for (g = 0; g < cap_rings->mr_gnum; g++) {
+		group = groups + g;
+
+		/* Prepare basic information of the group */
+		group->mrg_index = g;
+		group->mrg_type = rtype;
+		group->mrg_state = MAC_GROUP_STATE_UNINIT;
+		group->mrg_mh = (mac_handle_t)mip;
+		group->mrg_next = group + 1;
+
+		/* Zero to reuse the info data structure */
+		bzero(&group_info, sizeof (group_info));
+
+		/* Query group information from driver */
+		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
+		    (mac_group_handle_t)group);
+
+		switch (cap_rings->mr_group_type) {
+		case MAC_GROUP_TYPE_DYNAMIC:
+			if (cap_rings->mr_gaddring == NULL ||
+			    cap_rings->mr_gremring == NULL) {
+				DTRACE_PROBE3(
+				    mac__init__rings_no_addremring,
+				    char *, mip->mi_name,
+				    mac_group_add_ring_t,
+				    cap_rings->mr_gaddring,
+				    mac_group_add_ring_t,
+				    cap_rings->mr_gremring);
+				err = EINVAL;
+				goto bail;
+			}
+
+			switch (rtype) {
+			case MAC_RING_TYPE_RX:
+				/*
+				 * The first RX group must have non-zero
+				 * rings, and the following groups must
+				 * have zero rings.
+				 */
+				if (g == 0 && group_info.mgi_count == 0) {
+					DTRACE_PROBE1(
+					    mac__init__rings__rx__def__zero,
+					    char *, mip->mi_name);
+					err = EINVAL;
+					goto bail;
+				}
+				if (g > 0 && group_info.mgi_count != 0) {
+					DTRACE_PROBE3(
+					    mac__init__rings__rx__nonzero,
+					    char *, mip->mi_name,
+					    int, g, int, group_info.mgi_count);
+					err = EINVAL;
+					goto bail;
+				}
+				break;
+			case MAC_RING_TYPE_TX:
+				/*
+				 * All TX ring groups must have zero rings.
+				 */
+				if (group_info.mgi_count != 0) {
+					DTRACE_PROBE3(
+					    mac__init__rings__tx__nonzero,
+					    char *, mip->mi_name,
+					    int, g, int, group_info.mgi_count);
+					err = EINVAL;
+					goto bail;
+				}
+				break;
+			}
+			break;
+		case MAC_GROUP_TYPE_STATIC:
+			/*
+			 * Note that an empty group is allowed, e.g., an aggr
+			 * would start with an empty group.
+			 */
+			break;
+		default:
+			/* unknown group type */
+			DTRACE_PROBE2(mac__init__rings__unknown__type,
+			    char *, mip->mi_name,
+			    int, cap_rings->mr_group_type);
+			err = EINVAL;
+			goto bail;
+		}
+
+		/*
+		 * The driver must register group_info.mgi_addmac() and
+		 * mgi_remmac() for Rx groups to support multiple MAC
+		 * addresses.
+		 */
+		if (rtype == MAC_RING_TYPE_RX) {
+			if ((group_info.mgi_addmac == NULL) ||
+			    (group_info.mgi_remmac == NULL)) {
+				err = EINVAL;
+				goto bail;
+			}
+		}
+
+		/* Cache driver-supplied information */
+		group->mrg_info = group_info;
+
+		/* Update the group's status and group count. */
+		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
+		group_free++;
+
+		group->mrg_rings = NULL;
+		group->mrg_cur_count = 0;
+		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
+		ring_left -= group_info.mgi_count;
+
+		/* The current group size should be equal to the default value */
+		ASSERT(group->mrg_cur_count == group_info.mgi_count);
+	}
+
+	/* Build up a dummy group for free resources as a pool */
+	group = groups + cap_rings->mr_gnum;
+
+	/* Prepare basic information of the group */
+	group->mrg_index = -1;
+	group->mrg_type = rtype;
+	group->mrg_state = MAC_GROUP_STATE_UNINIT;
+	group->mrg_mh = (mac_handle_t)mip;
+	group->mrg_next = NULL;
+
+	/*
+	 * If there are ungrouped rings, allocate a contiguous buffer for
+	 * the remaining resources.
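+	 *
+	 * For illustration (hypothetical counts): with mr_gnum == 2 and
+	 * mr_rnum == 8, groups[0] and groups[1] hold the driver-declared
+	 * groups, and groups[2] is the dummy pool group (mrg_index == -1)
+	 * that collects whatever rings the declared groups did not claim.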
+	 */
+	if (ring_left != 0) {
+		group->mrg_rings = NULL;
+		group->mrg_cur_count = 0;
+		mac_init_group(mip, group, ring_left, cap_rings);
+
+		/* The current group size should be equal to ring_left */
+		ASSERT(group->mrg_cur_count == ring_left);
+
+		ring_left = 0;
+
+		/* Update this group's status */
+		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
+	} else
+		group->mrg_rings = NULL;
+
+	ASSERT(ring_left == 0);
+
+bail:
+	/* Cache other important information to finalize the initialization */
+	switch (rtype) {
+	case MAC_RING_TYPE_RX:
+		mip->mi_rx_group_type = cap_rings->mr_group_type;
+		mip->mi_rx_group_count = cap_rings->mr_gnum;
+		mip->mi_rx_groups = groups;
+		break;
+	case MAC_RING_TYPE_TX:
+		mip->mi_tx_group_type = cap_rings->mr_group_type;
+		mip->mi_tx_group_count = cap_rings->mr_gnum;
+		mip->mi_tx_group_free = group_free;
+		mip->mi_tx_groups = groups;
+
+		/*
+		 * Ring 0 is used as the default one and it could be assigned
+		 * to a client as well.
+		 */
+		group = groups + cap_rings->mr_gnum;
+		ring = group->mrg_rings;
+		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
+			ring = ring->mr_next;
+		ASSERT(ring->mr_index == 0);
+		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	if (err != 0)
+		mac_free_rings(mip, rtype);
+
+	return (err);
+}
+
+/*
+ * Called to free all ring groups of a particular type. It is assumed that
+ * all groups have already been released by the client.
+ */
+void
+mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
+{
+	mac_group_t *group, *groups;
+	uint_t group_count;
+
+	switch (rtype) {
+	case MAC_RING_TYPE_RX:
+		if (mip->mi_rx_groups == NULL)
+			return;
+
+		groups = mip->mi_rx_groups;
+		group_count = mip->mi_rx_group_count;
+
+		mip->mi_rx_groups = NULL;
+		mip->mi_rx_group_count = 0;
+		break;
+	case MAC_RING_TYPE_TX:
+		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
+
+		if (mip->mi_tx_groups == NULL)
+			return;
+
+		groups = mip->mi_tx_groups;
+		group_count = mip->mi_tx_group_count;
+
+		mip->mi_tx_groups = NULL;
+		mip->mi_tx_group_count = 0;
+		mip->mi_tx_group_free = 0;
+		mip->mi_default_tx_ring = NULL;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	for (group = groups; group != NULL; group = group->mrg_next) {
+		mac_ring_t *ring;
+
+		if (group->mrg_cur_count == 0)
+			continue;
+
+		ASSERT(group->mrg_rings != NULL);
+
+		while ((ring = group->mrg_rings) != NULL) {
+			group->mrg_rings = ring->mr_next;
+			mac_ring_free(mip, ring);
+		}
+	}
+
+	/* Free all the cached rings */
+	mac_ring_freeall(mip);
+	/* Free the block of group data structures */
+	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
+}
+
+/*
+ * Associate a MAC address with a receive group.
+ *
+ * The return value of this function should always be checked properly, because
+ * any type of failure could cause unexpected results. A MAC address can be
+ * added to or removed from a group only after the group has been reserved.
+ * Ideally, a successful reservation always leads to calling mac_group_addmac()
+ * to steer desired traffic. Failure to add a unicast MAC address doesn't
+ * always imply that the group is functioning abnormally.
+ *
+ * Currently this function is called everywhere, and it reflects assumptions
+ * about MAC addresses in the implementation. CR 6735196.
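+ *
+ * A hypothetical caller, for illustration of the check-the-return-value
+ * contract (mac_add_macaddr() below follows this pattern):
+ *
+ *	if ((err = mac_group_addmac(group, addr)) != 0) {
+ *		... fall back, e.g. to promiscuous mode, rather than
+ *		    assuming the address is active ...
+ *	}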
+ */
+int
+mac_group_addmac(mac_group_t *group, const uint8_t *addr)
+{
+	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+	ASSERT(group->mrg_info.mgi_addmac != NULL);
+
+	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
+}
+
+/*
+ * Remove the association between MAC address and receive group.
+ */
+int
+mac_group_remmac(mac_group_t *group, const uint8_t *addr)
+{
+	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+	ASSERT(group->mrg_info.mgi_remmac != NULL);
+
+	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
+}
+
+/*
+ * Release a ring in use by marking it MR_FREE.
+ * Any other client may reserve it for its use.
+ */
+void
+mac_release_tx_ring(mac_ring_handle_t rh)
+{
+	mac_ring_t *ring = (mac_ring_t *)rh;
+	mac_group_t *group = (mac_group_t *)ring->mr_gh;
+	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(ring->mr_state != MR_FREE);
+
+	/*
+	 * The default tx ring will be released by mac_stop().
+	 */
+	if (rh == mip->mi_default_tx_ring)
+		return;
+
+	mac_stop_ring(ring);
+
+	ring->mr_state = MR_FREE;
+	ring->mr_flag = 0;
+}
+
+/*
+ * Send packets through a selected tx ring.
+ */
+mblk_t *
+mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
+{
+	mac_ring_t *ring = (mac_ring_t *)rh;
+	mac_ring_info_t *info = &ring->mr_info;
+
+	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
+	ASSERT(ring->mr_state >= MR_INUSE);
+	ASSERT(info->mri_tx != NULL);
+
+	return (info->mri_tx(info->mri_driver, mp));
+}
+
+/*
+ * Find a ring from its index.
+ */
+mac_ring_t *
+mac_find_ring(mac_group_t *group, int index)
+{
+	mac_ring_t *ring;
+
+	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
+		if (ring->mr_index == index)
+			break;
+
+	return (ring);
+}
+
+/*
+ * Add a ring to an existing group.
+ *
+ * The ring must be either passed directly (for example if the ring
+ * movement is initiated by the framework), or specified through a driver
+ * index (for example when the ring is added by the driver).
+ *
+ * The caller needs to call mac_perim_enter() before calling this function.
+ */
+int
+i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
+{
+	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+	mac_capab_rings_t *cap_rings;
+	boolean_t driver_call = (ring == NULL);
+	mac_group_type_t group_type;
+	int ret = 0;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	switch (group->mrg_type) {
+	case MAC_RING_TYPE_RX:
+		cap_rings = &mip->mi_rx_rings_cap;
+		group_type = mip->mi_rx_group_type;
+		break;
+	case MAC_RING_TYPE_TX:
+		cap_rings = &mip->mi_tx_rings_cap;
+		group_type = mip->mi_tx_group_type;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * There should be no ring with the same ring index in the target
+	 * group.
+	 */
+	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
+	    NULL);
+
+	if (driver_call) {
+		/*
+		 * The function is called as a result of a request from
+		 * a driver to add a ring to an existing group, for example
+		 * from the aggregation driver. Allocate a new mac_ring_t
+		 * for that ring.
+		 */
+		ring = mac_init_ring(mip, group, index, cap_rings);
+		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
+	} else {
+		/*
+		 * The function is called as a result of a MAC layer request
+		 * to add a ring to an existing group. In this case the
+		 * ring is being moved between groups, which requires
+		 * the underlying driver to support dynamic grouping,
+		 * and the mac_ring_t already exists.
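+		 *
+		 * For illustration, the two call modes look like this
+		 * (hypothetical values):
+		 *
+		 *	(void) i_mac_group_add_ring(group, NULL, 2);
+		 *		(driver-initiated: no mac_ring_t yet, the
+		 *		ring is identified by its driver index)
+		 *
+		 *	(void) i_mac_group_add_ring(group, ring, 0);
+		 *		(framework-initiated move: the mac_ring_t
+		 *		exists and the index argument is unused)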
+		 */
+		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
+		ASSERT(cap_rings->mr_gaddring != NULL);
+		ASSERT(ring->mr_gh == NULL);
+	}
+
+	/*
+	 * At this point the ring should not be in use, and it should be
+	 * of the right type for the target group.
+	 */
+	ASSERT(ring->mr_state < MR_INUSE);
+	ASSERT(ring->mr_srs == NULL);
+	ASSERT(ring->mr_type == group->mrg_type);
+
+	if (!driver_call) {
+		/*
+		 * Add the driver-level hardware ring if the process was not
+		 * initiated by the driver, and the target group has a
+		 * driver-level counterpart (the dummy group for ungrouped
+		 * rings does not).
+		 */
+		if (group->mrg_driver != NULL) {
+			cap_rings->mr_gaddring(group->mrg_driver,
+			    ring->mr_driver, ring->mr_type);
+		}
+
+		/*
+		 * Insert the ring ahead of the existing rings.
+		 */
+		ring->mr_next = group->mrg_rings;
+		group->mrg_rings = ring;
+		ring->mr_gh = (mac_group_handle_t)group;
+		group->mrg_cur_count++;
+	}
+
+	/*
+	 * If the group has not been actively used, we're done.
+	 */
+	if (group->mrg_index != -1 &&
+	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
+		return (0);
+
+	/*
+	 * Set up SRS/SR according to the ring type.
+	 */
+	switch (ring->mr_type) {
+	case MAC_RING_TYPE_RX:
+		/*
+		 * Set up an SRS on top of the new ring if the group is
+		 * reserved for someone's exclusive use.
+		 */
+		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
+			flow_entry_t *flent;
+			mac_client_impl_t *mcip;
+
+			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
+			ASSERT(mcip != NULL);
+			flent = mcip->mci_flent;
+			ASSERT(flent->fe_rx_srs_cnt > 0);
+			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
+		}
+		break;
+	case MAC_RING_TYPE_TX:
+		/*
+		 * For TX this function is only invoked during the
+		 * initial creation of a group when a share is
+		 * associated with a MAC client. So the datapath is not
+		 * yet set up, and will be set up later after the
+		 * group has been reserved and populated.
+		 */
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * Start the ring if needed. On failure, undo the grouping action.
+	 */
+	if ((ret = mac_start_ring(ring)) != 0) {
+		if (ring->mr_type == MAC_RING_TYPE_RX) {
+			if (ring->mr_srs != NULL) {
+				mac_rx_srs_remove(ring->mr_srs);
+				ring->mr_srs = NULL;
+			}
+		}
+		if (!driver_call) {
+			cap_rings->mr_gremring(group->mrg_driver,
+			    ring->mr_driver, ring->mr_type);
+		}
+		group->mrg_cur_count--;
+		group->mrg_rings = ring->mr_next;
+
+		ring->mr_gh = NULL;
+
+		if (driver_call)
+			mac_ring_free(mip, ring);
+
+		return (ret);
+	}
+
+	/*
+	 * Update the ring's state.
+	 */
+	ring->mr_state = MR_INUSE;
+	MAC_RING_UNMARK(ring, MR_INCIPIENT);
+	return (0);
+}
+
+/*
+ * Remove a ring from its current group. MAC internal function for dynamic
+ * grouping.
+ *
+ * The caller needs to call mac_perim_enter() before calling this function.
+ */
+void
+i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
+    boolean_t driver_call)
+{
+	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+	mac_capab_rings_t *cap_rings = NULL;
+	mac_group_type_t group_type;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
+	ASSERT((mac_group_t *)ring->mr_gh == group);
+	ASSERT(ring->mr_type == group->mrg_type);
+
+	switch (ring->mr_type) {
+	case MAC_RING_TYPE_RX:
+		group_type = mip->mi_rx_group_type;
+		cap_rings = &mip->mi_rx_rings_cap;
+
+		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
+			mac_stop_ring(ring);
+
+		/*
+		 * Only hardware classified packets hold a reference to the
+		 * ring all the way up the Rx path. mac_rx_srs_remove()
+		 * will take care of quiescing the Rx path and removing the
+		 * SRS. The software classified path neither holds a reference
+		 * nor any association with the ring in mac_rx.
+		 */
+		if (ring->mr_srs != NULL) {
+			mac_rx_srs_remove(ring->mr_srs);
+			ring->mr_srs = NULL;
+		}
+		ring->mr_state = MR_FREE;
+		ring->mr_flag = 0;
+
+		break;
+	case MAC_RING_TYPE_TX:
+		/*
+		 * For TX this function is only invoked in two
+		 * cases:
+		 *
+		 * 1) In the case of a failure during the
+		 * initial creation of a group when a share is
+		 * associated with a MAC client. So the SRS is not
+		 * yet set up, and will be set up later after the
+		 * group has been reserved and populated.
+		 *
+		 * 2) From mac_release_tx_group() when freeing
+		 * a TX SRS.
+		 *
+		 * In both cases the SRS and its soft rings are
+		 * already quiesced.
+		 */
+		ASSERT(!driver_call);
+		group_type = mip->mi_tx_group_type;
+		cap_rings = &mip->mi_tx_rings_cap;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * Remove the ring from the group.
+	 */
+	if (ring == group->mrg_rings)
+		group->mrg_rings = ring->mr_next;
+	else {
+		mac_ring_t *pre;
+
+		pre = group->mrg_rings;
+		while (pre->mr_next != ring)
+			pre = pre->mr_next;
+		pre->mr_next = ring->mr_next;
+	}
+	group->mrg_cur_count--;
+
+	if (!driver_call) {
+		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
+		ASSERT(cap_rings->mr_gremring != NULL);
+
+		/*
+		 * Remove the driver-level hardware ring.
+		 */
+		if (group->mrg_driver != NULL) {
+			cap_rings->mr_gremring(group->mrg_driver,
+			    ring->mr_driver, ring->mr_type);
+		}
+	}
+
+	ring->mr_gh = NULL;
+	if (driver_call) {
+		mac_ring_free(mip, ring);
+	} else {
+		ring->mr_state = MR_FREE;
+		ring->mr_flag = 0;
+	}
+}
+
+/*
+ * Move a ring to the target group. If needed, remove the ring from the group
+ * that it currently belongs to.
+ *
+ * The caller needs to enter the MAC's perimeter by calling mac_perim_enter().
+ */
+static int
+mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
+{
+	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
+	int rv;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(d_group != NULL);
+	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
+
+	if (s_group == d_group)
+		return (0);
+
+	/*
+	 * Remove it from the current group first.
+	 */
+	if (s_group != NULL)
+		i_mac_group_rem_ring(s_group, ring, B_FALSE);
+
+	/*
+	 * Add it to the new group.
+	 */
+	rv = i_mac_group_add_ring(d_group, ring, 0);
+	if (rv != 0) {
+		/*
+		 * Failed to add the ring to the destination group;
+		 * try to put it back in the source group. If that
+		 * also fails, the ring is stuck in limbo, so log a message.
+		 */
+		if (i_mac_group_add_ring(s_group, ring, 0)) {
+			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
+			    mip->mi_name, (void *)ring);
+		}
+	}
+
+	return (rv);
+}
+
+/*
+ * Find a MAC address according to its value.
+ */
+mac_address_t *
+mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
+{
+	mac_address_t *map;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
+		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
+			break;
+	}
+
+	return (map);
+}
+
+/*
+ * Check whether the MAC address is shared by multiple clients.
+ */
+boolean_t
+mac_check_macaddr_shared(mac_address_t *map)
+{
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
+
+	return (map->ma_nusers > 1);
+}
+
+/*
+ * Enable a MAC address by enabling promiscuous mode.
+ */
+static int
+mac_add_macaddr_promisc(mac_impl_t *mip, mac_group_t *group)
+{
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	/*
+	 * The current interface only allows promiscuous mode to be set
+	 * with the default group. Note that mip->mi_rx_groups might be NULL.
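+	 *
+	 * For illustration of the contract: any group other than the
+	 * default one is rejected with ENOTSUP, so a hypothetical caller
+	 * passes the default group (possibly NULL):
+	 *
+	 *	err = mac_add_macaddr_promisc(mip, mip->mi_rx_groups);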
+	 */
+	ASSERT(group == mip->mi_rx_groups);
+
+	if (group == mip->mi_rx_groups)
+		return (i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC));
+	else
+		return (ENOTSUP);
+}
+
+/*
+ * Remove a MAC address that was added by enabling promiscuous mode.
+ */
+static int
+mac_remove_macaddr_promisc(mac_impl_t *mip, mac_group_t *group)
+{
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(group == mip->mi_rx_groups);
+
+	return (i_mac_promisc_set(mip, B_FALSE, MAC_DEVPROMISC));
+}
+
+/*
+ * Remove the specified MAC address from the MAC address list and free it.
+ */
+static void
+mac_free_macaddr(mac_address_t *map)
+{
+	mac_impl_t *mip = map->ma_mip;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(mip->mi_addresses != NULL);
+
+	map = mac_find_macaddr(mip, map->ma_addr);
+
+	ASSERT(map != NULL);
+	ASSERT(map->ma_nusers == 0);
+
+	if (map == mip->mi_addresses) {
+		mip->mi_addresses = map->ma_next;
+	} else {
+		mac_address_t *pre;
+
+		pre = mip->mi_addresses;
+		while (pre->ma_next != map)
+			pre = pre->ma_next;
+		pre->ma_next = map->ma_next;
+	}
+
+	kmem_free(map, sizeof (mac_address_t));
+}
+
+/*
+ * Add a MAC address reference for a client. If the desired MAC address
+ * exists, add a reference to it. Otherwise, add the new address by adding
+ * it to a reserved group or by setting promiscuous mode. This function
+ * won't try a different group if the given group is non-NULL, so the caller
+ * must explicitly share the default group when needed.
+ *
+ * Note that the primary MAC address is initialized at registration time, so
+ * adding it to the default group only requires activating it if its reference
+ * count is still zero. Also, some drivers may not have advertised the RINGS
+ * capability.
+ */
+int
+mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr)
+{
+	mac_address_t *map;
+	int err = 0;
+	boolean_t allocated_map = B_FALSE;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	map = mac_find_macaddr(mip, mac_addr);
+
+	/*
+	 * If the new MAC address has not been added yet, allocate a new
+	 * entry and set it up.
+	 */
+	if (map == NULL) {
+		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
+		map->ma_len = mip->mi_type->mt_addr_length;
+		bcopy(mac_addr, map->ma_addr, map->ma_len);
+		map->ma_nusers = 0;
+		map->ma_group = group;
+		map->ma_mip = mip;
+
+		/* add the new MAC address to the head of the address list */
+		map->ma_next = mip->mi_addresses;
+		mip->mi_addresses = map;
+
+		allocated_map = B_TRUE;
+	}
+
+	ASSERT(map->ma_group == group);
+
+	/*
+	 * If the MAC address is already in use, simply account for the
+	 * new client.
+	 */
+	if (map->ma_nusers++ > 0)
+		return (0);
+
+	/*
+	 * Activate this MAC address by adding it to the reserved group.
+	 */
+	if (group != NULL) {
+		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
+		if (err == 0) {
+			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+			return (0);
+		}
+	}
+
+	/*
+	 * Try promiscuous mode. Note that rx_groups could be NULL, so we
+	 * need to handle drivers that don't advertise the RINGS capability.
+	 */
+	if (group == mip->mi_rx_groups) {
+		/*
+		 * For drivers that don't advertise the RINGS capability, do
+		 * nothing for the primary address.
+		 */
+		if ((group == NULL) &&
+		    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
+			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+			return (0);
+		}
+
+		/*
+		 * Enable promiscuous mode in order to receive traffic
+		 * to the new MAC address.
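+		 * On success the address is marked
+		 * MAC_ADDRESS_TYPE_UNICAST_PROMISC rather than
+		 * _CLASSIFIED, so that mac_remove_macaddr() below knows
+		 * to undo the promiscuous setting instead of calling
+		 * mac_group_remmac().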
+		 */
+		err = mac_add_macaddr_promisc(mip, group);
+		if (err == 0) {
+			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
+			return (0);
+		}
+	}
+
+	/*
+	 * Free the MAC address that could not be added. Don't free
+	 * a pre-existing address, it could have been the entry
+	 * for the primary MAC address which was pre-allocated by
+	 * mac_init_macaddr(), and which must remain on the list.
+	 */
+	map->ma_nusers--;
+	if (allocated_map)
+		mac_free_macaddr(map);
+	return (err);
+}
+
+/*
+ * Remove a reference to a MAC address. This may cause the MAC address to be
+ * removed from its associated group, or promiscuous mode to be turned off.
+ * The caller needs to handle any failure properly.
+ */
+int
+mac_remove_macaddr(mac_address_t *map)
+{
+	mac_impl_t *mip = map->ma_mip;
+	int err = 0;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
+
+	/*
+	 * If it's not the last client using this MAC address, only update
+	 * the MAC client count.
+	 */
+	if (--map->ma_nusers > 0)
+		return (0);
+
+	/*
+	 * The MAC address is no longer used by any MAC client, so remove
+	 * it from its associated group, or turn off promiscuous mode
+	 * if it was enabled for the MAC address.
+	 */
+	switch (map->ma_type) {
+	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
+		/*
+		 * Don't free the preset primary address for drivers that
+		 * don't advertise the RINGS capability.
+		 */
+		if (map->ma_group == NULL)
+			return (0);
+
+		err = mac_group_remmac(map->ma_group, map->ma_addr);
+		break;
+	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
+		err = mac_remove_macaddr_promisc(mip, map->ma_group);
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	if (err != 0)
+		return (err);
+
+	/*
+	 * We created the entry for the primary MAC address at registration,
+	 * so we don't free it here. mac_fini_macaddr() will take care of it.
+	 */
+	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
+		mac_free_macaddr(map);
+
+	return (0);
+}
+
+/*
+ * Update an existing MAC address. The caller needs to make sure that the new
+ * value is not currently in use.
+ */
+int
+mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
+{
+	mac_impl_t *mip = map->ma_mip;
+	int err = 0;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
+
+	switch (map->ma_type) {
+	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
+		/*
+		 * Update the primary address for drivers that are not
+		 * RINGS capable.
+		 */
+		if (map->ma_group == NULL) {
+			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
+			    mac_addr);
+			if (err != 0)
+				return (err);
+			break;
+		}
+
+		/*
+		 * If this MAC address is not currently in use,
+		 * simply break out and update the value.
+		 */
+		if (map->ma_nusers == 0)
+			break;
+
+		/*
+		 * Need to replace the MAC address associated with a group.
+		 */
+		err = mac_group_remmac(map->ma_group, map->ma_addr);
+		if (err != 0)
+			return (err);
+
+		err = mac_group_addmac(map->ma_group, mac_addr);
+
+		/*
+		 * Failure hints at a hardware error; the MAC layer needs
+		 * an error notification facility to handle this.
+		 * For now, simply try to restore the old value.
+		 */
+		if (err != 0)
+			(void) mac_group_addmac(map->ma_group, map->ma_addr);
+
+		break;
+	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
+		/*
+		 * Nothing more needs to be done in promiscuous mode.
+		 */
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * Successfully replaced the MAC address.
+	 */
+	if (err == 0)
+		bcopy(mac_addr, map->ma_addr, map->ma_len);
+
+	return (err);
+}
+
+/*
+ * Freshen the MAC address with a new value. Its caller must have updated
+ * the hardware MAC address before calling this function.
+ * This function is meant to handle the MAC address change notification
+ * from the underlying drivers.
+ */
+void
+mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
+{
+	mac_impl_t *mip = map->ma_mip;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
+
+	/*
+	 * Freshen the MAC address with the new value.
+	 */
+	bcopy(mac_addr, map->ma_addr, map->ma_len);
+	bcopy(mac_addr, mip->mi_addr, map->ma_len);
+
+	/*
+	 * Update all MAC clients that share this MAC address.
+	 */
+	mac_unicast_update_clients(mip, map);
+}
+
+/*
+ * Set up the primary MAC address.
+ */
+void
+mac_init_macaddr(mac_impl_t *mip)
+{
+	mac_address_t *map;
+
+	/*
+	 * The reference count is initialized to zero, until the address
+	 * is really activated.
+	 */
+	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
+	map->ma_len = mip->mi_type->mt_addr_length;
+	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
+
+	/*
+	 * If the driver advertises the RINGS capability, it shouldn't have
+	 * initialized its primary MAC address. For other drivers, including
+	 * VNIC, the primary address must work after registration.
+	 */
+	if (mip->mi_rx_groups == NULL)
+		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+
+	/*
+	 * The primary MAC address is reserved for the default group
+	 * according to the current design.
+	 */
+	map->ma_group = mip->mi_rx_groups;
+	map->ma_mip = mip;
+
+	mip->mi_addresses = map;
+}
+
+/*
+ * Clean up the primary MAC address. Note, only one primary MAC address
+ * is allowed. All other MAC addresses must have been freed appropriately.
+ */
+void
+mac_fini_macaddr(mac_impl_t *mip)
+{
+	mac_address_t *map = mip->mi_addresses;
+
+	/* there should be exactly one entry left on the list */
+	ASSERT(map != NULL);
+	ASSERT(map->ma_nusers == 0);
+	ASSERT(map->ma_next == NULL);
+
+	kmem_free(map, sizeof (mac_address_t));
+	mip->mi_addresses = NULL;
+}
+
+/*
+ * Logging related functions.
+ */
+
+/* Write the flow description to the log file */
+int
+mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
+{
+	flow_desc_t *fdesc;
+	mac_resource_props_t *mrp;
+	net_desc_t ndesc;
+
+	bzero(&ndesc, sizeof (net_desc_t));
+
+	/*
+	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc are done under the fe_lock.
+	 */
+	mutex_enter(&flent->fe_lock);
+	fdesc = &flent->fe_flow_desc;
+	mrp = &flent->fe_resource_props;
+
+	ndesc.nd_name = flent->fe_flow_name;
+	ndesc.nd_devname = mcip->mci_name;
+	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
+	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
+	ndesc.nd_sap = htonl(fdesc->fd_sap);
+	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
+	ndesc.nd_bw_limit = mrp->mrp_maxbw;
+	if (ndesc.nd_isv4) {
+		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
+		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
+	} else {
+		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
+		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
+	}
+	ndesc.nd_sport = htons(fdesc->fd_local_port);
+	ndesc.nd_dport = htons(fdesc->fd_remote_port);
+	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
+	mutex_exit(&flent->fe_lock);
+
+	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
+}
+
+/* Write the flow statistics to the log file */
+int
+mac_write_flow_stats(flow_entry_t *flent)
+{
+	flow_stats_t *fl_stats;
+	net_stat_t nstat;
+
+	fl_stats = &flent->fe_flowstats;
+	nstat.ns_name = flent->fe_flow_name;
+	nstat.ns_ibytes = fl_stats->fs_rbytes;
+	nstat.ns_obytes = fl_stats->fs_obytes;
+	nstat.ns_ipackets = fl_stats->fs_ipackets;
+	nstat.ns_opackets = fl_stats->fs_opackets;
+	nstat.ns_ierrors = fl_stats->fs_ierrors;
+	nstat.ns_oerrors = fl_stats->fs_oerrors;
+
+	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
+}
+
+/* Write the link description to the log file */
+int
+mac_write_link_desc(mac_client_impl_t *mcip)
+{
+	net_desc_t ndesc;
+	flow_entry_t *flent = mcip->mci_flent;
+
+	bzero(&ndesc, sizeof (net_desc_t));
+
+	ndesc.nd_name = mcip->mci_name;
+	ndesc.nd_devname = mcip->mci_name;
+	ndesc.nd_isv4 = B_TRUE;
+	/*
+	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc are done under the fe_lock
+	 * after removing the flent from the flow table.
+	 */
+	mutex_enter(&flent->fe_lock);
+	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
+	mutex_exit(&flent->fe_lock);
+
+	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
+}
+
+/* Write the link statistics to the log file */
+int
+mac_write_link_stats(mac_client_impl_t *mcip)
+{
+	net_stat_t nstat;
+
+	nstat.ns_name = mcip->mci_name;
+	nstat.ns_ibytes = mcip->mci_stat_ibytes;
+	nstat.ns_obytes = mcip->mci_stat_obytes;
+	nstat.ns_ipackets = mcip->mci_stat_ipackets;
+	nstat.ns_opackets = mcip->mci_stat_opackets;
+	nstat.ns_ierrors = mcip->mci_stat_ierrors;
+	nstat.ns_oerrors = mcip->mci_stat_oerrors;
+
+	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
+}
+
+/*
+ * For a given flow, if the description has not been logged before, do it now.
+ * If it is a VNIC, then we have collected information about it from the MAC
+ * table, so skip it.
+ */
+/*ARGSUSED*/
+static int
+mac_log_flowinfo(flow_entry_t *flent, void *args)
+{
+	mac_client_impl_t *mcip = flent->fe_mcip;
+
+	if (mcip == NULL)
+		return (0);
+
+	/*
+	 * If the name starts with "vnic" and fe_user_generated is true (this
+	 * excludes the mcast and active flow entries created implicitly for
+	 * a vnic), it is a VNIC flow; i.e., vnic1 is a VNIC flow, while
+	 * vnic/bge1/mcast1 and vnic/bge1/active are not.
+     */
+    if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
+        (flent->fe_type & FLOW_USER) != 0) {
+        return (0);
+    }
+
+    if (!flent->fe_desc_logged) {
+        /*
+         * We don't return an error because we want to continue the
+         * walk in case this is the last walk, which means we
+         * need to reset fe_desc_logged in all the flows.
+         */
+        if (mac_write_flow_desc(flent, mcip) != 0)
+            return (0);
+        flent->fe_desc_logged = B_TRUE;
+    }
+
+    /*
+     * Regardless of the error, we want to proceed in case we have to
+     * reset fe_desc_logged.
+     */
+    (void) mac_write_flow_stats(flent);
+
+    if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
+        flent->fe_desc_logged = B_FALSE;
+
+    return (0);
+}
+
+typedef struct i_mac_log_state_s {
+    boolean_t mi_last;
+    int mi_fenable;
+    int mi_lenable;
+} i_mac_log_state_t;
+
+/*
+ * Walk the mac_impl_ts and log the description for each mac client of this
+ * mac, if it hasn't already been done. Additionally, log statistics for the
+ * link as well. Walk the flow table and log information for each flow as
+ * well.
+ * If it is the last walk (mi_last), then we turn off MCIS_DESC_LOGGED (and
+ * also fe_desc_logged, if flow logging is on) since we want to log the
+ * description if and when logging is restarted.
+ */
+/*ARGSUSED*/
+static uint_t
+i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+    mac_impl_t *mip = (mac_impl_t *)val;
+    i_mac_log_state_t *lstate = (i_mac_log_state_t *)arg;
+    int ret;
+    mac_client_impl_t *mcip;
+
+    /*
+     * Only walk the client list for NIC and etherstub
+     */
+    if ((mip->mi_state_flags & MIS_DISABLED) ||
+        ((mip->mi_state_flags & MIS_IS_VNIC) &&
+        (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
+        return (MH_WALK_CONTINUE);
+
+    for (mcip = mip->mi_clients_list; mcip != NULL;
+        mcip = mcip->mci_client_next) {
+        if (!MCIP_DATAPATH_SETUP(mcip))
+            continue;
+        if (lstate->mi_lenable) {
+            if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
+                ret = mac_write_link_desc(mcip);
+                if (ret != 0) {
+                    /*
+                     * We can't terminate it if this is the last
+                     * walk, else there might be some links with
+                     * MCIS_DESC_LOGGED still set, which means
+                     * their description won't be logged the next
+                     * time logging is started (similarly for the
+                     * flows within such links). We can continue
+                     * without walking the flow table (i.e. to
+                     * set fe_desc_logged to false) because we
+                     * won't have written any flow stuff for this
+                     * link as we haven't logged the link itself.
+                     */
+                    if (lstate->mi_last)
+                        return (MH_WALK_CONTINUE);
+                    else
+                        return (MH_WALK_TERMINATE);
+                }
+                mcip->mci_state_flags |= MCIS_DESC_LOGGED;
+            }
+        }
+
+        if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
+            return (MH_WALK_TERMINATE);
+
+        if (lstate->mi_last)
+            mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
+
+        if (lstate->mi_fenable) {
+            if (mcip->mci_subflow_tab != NULL) {
+                (void) mac_flow_walk(mcip->mci_subflow_tab,
+                    mac_log_flowinfo, mip);
+            }
+        }
+    }
+    return (MH_WALK_CONTINUE);
+}
+
+/*
+ * The timer thread that runs every mac_logging_interval seconds and logs
+ * link and/or flow information.
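+ *
+ * A sketch of how logging is driven end to end, using only functions
+ * defined in this file (the 20 second interval is illustrative):
+ *
+ *     mac_start_logusage(MAC_LOGTYPE_LINK, 20);   first walk, arms timer
+ *     ...                                         periodic walks, rearmed
+ *                                                 via timeout(9F) below
+ *     mac_stop_logusage(MAC_LOGTYPE_LINK);        final walk with mi_last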
+ */
+/* ARGSUSED */
+void
+mac_log_linkinfo(void *arg)
+{
+    i_mac_log_state_t lstate;
+
+    rw_enter(&i_mac_impl_lock, RW_READER);
+    if (!mac_flow_log_enable && !mac_link_log_enable) {
+        rw_exit(&i_mac_impl_lock);
+        return;
+    }
+    lstate.mi_fenable = mac_flow_log_enable;
+    lstate.mi_lenable = mac_link_log_enable;
+    lstate.mi_last = B_FALSE;
+    rw_exit(&i_mac_impl_lock);
+
+    mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
+
+    rw_enter(&i_mac_impl_lock, RW_WRITER);
+    if (mac_flow_log_enable || mac_link_log_enable) {
+        mac_logging_timer = timeout(mac_log_linkinfo, NULL,
+            SEC_TO_TICK(mac_logging_interval));
+    }
+    rw_exit(&i_mac_impl_lock);
+}
+
+/*
+ * Start the logging timer.
+ */
+void
+mac_start_logusage(mac_logtype_t type, uint_t interval)
+{
+    rw_enter(&i_mac_impl_lock, RW_WRITER);
+    switch (type) {
+    case MAC_LOGTYPE_FLOW:
+        if (mac_flow_log_enable) {
+            rw_exit(&i_mac_impl_lock);
+            return;
+        }
+        mac_flow_log_enable = B_TRUE;
+        /* FALLTHRU */
+    case MAC_LOGTYPE_LINK:
+        if (mac_link_log_enable) {
+            rw_exit(&i_mac_impl_lock);
+            return;
+        }
+        mac_link_log_enable = B_TRUE;
+        break;
+    default:
+        ASSERT(0);
+    }
+    mac_logging_interval = interval;
+    rw_exit(&i_mac_impl_lock);
+    mac_log_linkinfo(NULL);
+}
+
+/*
+ * Stop the logging timer if both Link and Flow logging are turned off.
+ */
+void
+mac_stop_logusage(mac_logtype_t type)
+{
+    i_mac_log_state_t lstate;
+
+    rw_enter(&i_mac_impl_lock, RW_WRITER);
+    lstate.mi_fenable = mac_flow_log_enable;
+    lstate.mi_lenable = mac_link_log_enable;
+
+    /* Last walk */
+    lstate.mi_last = B_TRUE;
+
+    switch (type) {
+    case MAC_LOGTYPE_FLOW:
+        if (lstate.mi_fenable) {
+            ASSERT(mac_link_log_enable);
+            mac_flow_log_enable = B_FALSE;
+            mac_link_log_enable = B_FALSE;
+            break;
+        }
+        /* FALLTHRU */
+    case MAC_LOGTYPE_LINK:
+        if (!lstate.mi_lenable || mac_flow_log_enable) {
+            rw_exit(&i_mac_impl_lock);
+            return;
+        }
+        mac_link_log_enable = B_FALSE;
+        break;
+    default:
+        ASSERT(0);
+    }
+    rw_exit(&i_mac_impl_lock);
+    (void) untimeout(mac_logging_timer);
+    mac_logging_timer = 0;
+
+    /* Last walk */
+    mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
+}
+
+/*
+ * Walk the rx and tx SRS/SRs for a flow and update the priority value.
+ */
+void
+mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+    pri_t pri;
+    int count;
+    mac_soft_ring_set_t *mac_srs;
+
+    if (flent->fe_rx_srs_cnt <= 0)
+        return;
+
+    if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
+        SRST_FLOW) {
+        pri = FLOW_PRIORITY(mcip->mci_min_pri,
+            mcip->mci_max_pri,
+            flent->fe_resource_props.mrp_priority);
+    } else {
+        pri = mcip->mci_max_pri;
+    }
+
+    for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
+        mac_srs = flent->fe_rx_srs[count];
+        mac_update_srs_priority(mac_srs, pri);
+    }
+    /*
+     * If we have a Tx SRS, we need to modify all the threads associated
+     * with it.
+     */
+    if (flent->fe_tx_srs != NULL)
+        mac_update_srs_priority(flent->fe_tx_srs, pri);
+}
+
+/*
+ * RX and TX rings are reserved according to different semantics depending
+ * on the requests from the MAC clients and type of rings:
+ *
+ * On the Tx side, by default we reserve individual rings, independently from
+ * the groups.
+ *
+ * On the Rx side, the reservation is at the granularity of the group
+ * of rings, and used for v12n level 1 only. It has a special case for the
+ * primary client.
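+ * For example, the unicast flow of a non-primary client first tries a
+ * non-default group (MAC_RX_RESERVE_NONDEFAULT in mac_reserve_rx_group()
+ * below), while the primary client reserves the default group
+ * (MAC_RX_RESERVE_DEFAULT).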
+ *
+ * If a share is allocated to a MAC client, we allocate a TX group and an
+ * RX group to the client, and assign TX rings and RX rings to these
+ * groups according to information gathered from the driver through
+ * the share capability.
+ *
+ * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
+ * to allocate individual rings out of a group and program the hw classifier
+ * based on IP address or higher level criteria.
+ */
+
+/*
+ * mac_reserve_tx_ring()
+ * Reserve an unused ring by marking it with the MR_INUSE state.
+ * Once reserved, the ring is ready to function.
+ *
+ * Notes for Hybrid I/O:
+ *
+ * If a specific ring is needed, it is specified through the desired_ring
+ * argument. Otherwise that argument is set to NULL.
+ * If the desired ring was previously allocated to another client, this
+ * function swaps it with a new ring from the group of unassigned rings.
+ */
+mac_ring_t *
+mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
+{
+    mac_group_t *group;
+    mac_ring_t *ring;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    if (mip->mi_tx_groups == NULL)
+        return (NULL);
+
+    /*
+     * Find an available ring and start it before changing its status.
+     * The unassigned rings are at the end of the mi_tx_groups
+     * array.
+     */
+    group = mip->mi_tx_groups + mip->mi_tx_group_count;
+
+    for (ring = group->mrg_rings; ring != NULL;
+        ring = ring->mr_next) {
+        if (desired_ring == NULL) {
+            if (ring->mr_state == MR_FREE)
+                /* wanted any free ring and found one */
+                break;
+        } else {
+            mac_ring_t *sring;
+            mac_client_impl_t *client;
+            mac_soft_ring_set_t *srs;
+
+            if (ring != desired_ring)
+                /* wants a desired ring but this one ain't it */
+                continue;
+
+            if (ring->mr_state == MR_FREE)
+                break;
+
+            /*
+             * Found the desired ring but it's already in use.
+             * Swap it with a new ring.
+             */
+
+            /* find the client which owns that ring */
+            for (client = mip->mi_clients_list; client != NULL;
+                client = client->mci_client_next) {
+                srs = MCIP_TX_SRS(client);
+                if (srs != NULL && mac_tx_srs_ring_present(srs,
+                    desired_ring)) {
+                    /* found our ring */
+                    break;
+                }
+            }
+            ASSERT(client != NULL);
+
+            /*
+             * Note that we cannot simply invoke the group
+             * add/rem routines since the client doesn't have a
+             * TX group. So we need to instead add/remove
+             * the rings from the SRS.
+             */
+            ASSERT(client->mci_share == NULL);
+
+            /* first quiesce the client */
+            mac_tx_client_quiesce(client, SRS_QUIESCE);
+
+            /* give a new ring to the client... */
+            sring = mac_reserve_tx_ring(mip, NULL);
+            /*
+             * If sring is NULL, there is no other available
+             * ring on that MAC instance, and the client will
+             * fall back to the shared TX ring.
+             *
+             * XXX if the user required the client
+             * to have a hardware transmit ring,
+             * we need to ensure we don't remove
+             * the last ring from the client.
+             * In that case look for a replacement
+             * ring from a client which does not
+             * require a hardware ring; we could
+             * add an argument to
+             * mac_reserve_tx_ring() which causes
+             * it to take a ring from such a client
+             * even if the desired ring is NULL.
+             * This will have to be done as part
+             * of the fix for CR 6758935. If that still
+             * fails, i.e. if all rings are allocated
+             * to clients which require rings, then
+             * cleanly fail the operation.
+             */
+            if (sring != NULL)
+                mac_tx_srs_add_ring(srs, sring);
+
+            /* ... in exchange for our desired ring */
+            mac_tx_srs_del_ring(srs, desired_ring);
+
+            /* restart the client */
+            mac_tx_client_restart(client);
+
+            break;
+        }
+    }
+
+    if (ring != NULL) {
+        if (mac_start_ring(ring) != 0)
+            return (NULL);
+        ring->mr_state = MR_INUSE;
+    }
+
+    return (ring);
+}
+
+/*
+ * Minimum number of rings to leave in the default RX group when allocating
+ * rings to new clients.
+ */
+static uint_t mac_min_rx_default_rings = 1;
+
+/*
+ * Populate a zero-ring group with rings. If the share is non-NULL,
+ * the rings are chosen according to that share.
+ * Invoked after allocating a new RX or TX group through
+ * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
+ * Returns zero on success, an errno otherwise.
+ */
+int
+i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
+    mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
+{
+    mac_ring_t **rings, *tmp_ring[1], *ring;
+    uint_t nrings;
+    int rv, i, j;
+
+    ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
+        mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
+    ASSERT(new_group->mrg_cur_count == 0);
+
+    /*
+     * First find the rings to allocate to the group.
+     */
+    if (share != NULL) {
+        /* get rings through ms_squery() */
+        mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
+        ASSERT(nrings != 0);
+        rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
+            KM_SLEEP);
+        mip->mi_share_capab.ms_squery(share, ring_type,
+            (mac_ring_handle_t *)rings, &nrings);
+    } else {
+        /* this function is called for TX only with a share */
+        ASSERT(ring_type == MAC_RING_TYPE_RX);
+        /*
+         * Pick one ring from the default group.
+         *
+         * For now pick the second ring, which requires the first
+         * ring at index 0 to stay in the default group, since it
+         * is the ring which carries the multicast traffic.
+         * We need a better way for a driver to indicate this,
+         * for example a per-ring flag.
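+         * (Hypothetically, something like an MR_MCAST per-ring flag
+         * set by the driver; no such flag exists in this patch.)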
+         */
+        for (ring = src_group->mrg_rings; ring != NULL;
+            ring = ring->mr_next) {
+            if (ring->mr_index != 0)
+                break;
+        }
+        ASSERT(ring != NULL);
+        nrings = 1;
+        tmp_ring[0] = ring;
+        rings = tmp_ring;
+    }
+
+    switch (ring_type) {
+    case MAC_RING_TYPE_RX:
+        if (src_group->mrg_cur_count - nrings <
+            mac_min_rx_default_rings) {
+            /* we ran out of rings */
+            return (ENOSPC);
+        }
+
+        /* move receive rings to new group */
+        for (i = 0; i < nrings; i++) {
+            rv = mac_group_mov_ring(mip, new_group, rings[i]);
+            if (rv != 0) {
+                /* move rings back on failure */
+                for (j = 0; j < i; j++) {
+                    (void) mac_group_mov_ring(mip,
+                        src_group, rings[j]);
+                }
+                return (rv);
+            }
+        }
+        break;
+
+    case MAC_RING_TYPE_TX: {
+        mac_ring_t *tmp_ring;
+
+        /* move the TX rings to the new group */
+        ASSERT(src_group == NULL);
+        for (i = 0; i < nrings; i++) {
+            /* get the desired ring */
+            tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
+            ASSERT(tmp_ring == rings[i]);
+            rv = mac_group_mov_ring(mip, new_group, rings[i]);
+            if (rv != 0) {
+                /* cleanup on failure */
+                for (j = 0; j < i; j++) {
+                    (void) mac_group_mov_ring(mip,
+                        mip->mi_tx_groups +
+                        mip->mi_tx_group_count, rings[j]);
+                }
+            }
+        }
+        break;
+    }
+    }
+
+    if (share != NULL) {
+        /* add group to share */
+        mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
+        /* free temporary array of rings */
+        kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
+    }
+
+    return (0);
+}
+
+void
+mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
+{
+    mac_grp_client_t *mgcp;
+
+    for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
+        if (mgcp->mgc_client == mcip)
+            break;
+    }
+
+    VERIFY(mgcp == NULL);
+
+    mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
+    mgcp->mgc_client = mcip;
+    mgcp->mgc_next = grp->mrg_clients;
+    grp->mrg_clients = mgcp;
+}
+
+void
+mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
+{
+    mac_grp_client_t *mgcp, **pprev;
+
+    for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
+        pprev = &mgcp->mgc_next, mgcp = *pprev) {
+        if (mgcp->mgc_client == mcip)
+            break;
+    }
+
+    ASSERT(mgcp != NULL);
+
+    *pprev = mgcp->mgc_next;
+    kmem_free(mgcp, sizeof (mac_grp_client_t));
+}
+
+/*
+ * mac_reserve_rx_group()
+ *
+ * Finds an available group and exclusively reserves it for a client.
+ * The group is chosen to suit the flow's resource controls (bandwidth and
+ * fanout requirements) and the address type.
+ * If the requestor is the primary MAC then return the group with the
+ * largest number of rings, otherwise the default group when available.
+ */
+mac_group_t *
+mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
+    mac_rx_group_reserve_type_t rtype)
+{
+    mac_share_handle_t share = mcip->mci_share;
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_group_t *grp = NULL;
+    int i, start, loopcount;
+    int err;
+    mac_address_t *map;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    /* Check if a group already has this mac address (case of VLANs) */
+    if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
+        return (map->ma_group);
+
+    if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
+        rtype == MAC_RX_NO_RESERVE)
+        return (NULL);
+
+    /*
+     * Try to exclusively reserve an RX group.
+     *
+     * Flows requiring SW_RING always go to the default group
+     * (until we can explicitly call out default groups (CR 6695600),
+     * we assume that the default group is always at position zero).
+     *
+     * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
+     * client), try to reserve the default RX group only.
+     *
+     * For flows requiring HW_RING (unicast flow of other clients), try
+     * to reserve a non-default RX group first, then the default group.
+     */
+    switch (rtype) {
+    case MAC_RX_RESERVE_DEFAULT:
+        start = 0;
+        loopcount = 1;
+        break;
+    case MAC_RX_RESERVE_NONDEFAULT:
+        start = 1;
+        loopcount = mip->mi_rx_group_count;
+    }
+
+    for (i = start; i < start + loopcount; i++) {
+        grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];
+
+        DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
+            int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
+
+        /*
+         * Check to see whether this mac client is the only client
+         * on this RX group. If not, we cannot exclusively reserve
+         * this RX group.
+         */
+        if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
+            (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
+            continue;
+        }
+
+        /*
+         * This group could already be SHARED by other multicast
+         * flows on this client. In that case, the group would
+         * be shared and has already been started.
+         */
+        ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
+
+        if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
+            (mac_start_group(grp) != 0)) {
+            continue;
+        }
+
+        if ((i % mip->mi_rx_group_count) == 0 ||
+            mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
+            break;
+        }
+
+        ASSERT(grp->mrg_cur_count == 0);
+
+        /*
+         * Populate the group. Rings should be taken
+         * from the default group at position 0 for now.
+         */
+
+        err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
+            &mip->mi_rx_groups[0], grp, share);
+        if (err == 0)
+            break;
+
+        DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
+            mip->mi_name, int, grp->mrg_index, int, err);
+
+        /*
+         * It's a dynamic group but the grouping operation failed.
+         */
+        mac_stop_group(grp);
+    }
+
+    if (i == start + loopcount)
+        return (NULL);
+
+    ASSERT(grp != NULL);
+
+    DTRACE_PROBE2(rx__group__reserved,
+        char *, mip->mi_name, int, grp->mrg_index);
+    return (grp);
+}
+
+/*
+ * mac_release_rx_group()
+ *
+ * This is called when there are no clients left for the group.
+ * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
+ * and if it is a non-default group, the shares are removed and
+ * all rings are assigned back to the default group.
+ */
+void
+mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_ring_t *ring;
+
+    ASSERT(group != &mip->mi_rx_groups[0]);
+
+    /*
+     * This is the case where there are no clients left. Any
+     * SRS etc. on this group have also been quiesced.
+     */
+    for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+        if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
+            ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
+            /*
+             * Remove the SRS associated with the HW ring.
+             * As a result, polling will be disabled.
+             */
+            ring->mr_srs = NULL;
+        }
+        ASSERT(ring->mr_state == MR_INUSE);
+        mac_stop_ring(ring);
+        ring->mr_state = MR_FREE;
+        ring->mr_flag = 0;
+    }
+
+    /* remove group from share */
+    if (mcip->mci_share != NULL) {
+        mip->mi_share_capab.ms_sremove(mcip->mci_share,
+            group->mrg_driver);
+    }
+
+    if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
+        mac_ring_t *ring;
+
+        /*
+         * Rings were dynamically allocated to the group.
+         * Move the rings back to the default group.
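+         * Each mac_group_mov_ring() call below unlinks the ring from
+         * mrg_rings, so the while loop terminates once every ring has
+         * been returned.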
+         */
+        while ((ring = group->mrg_rings) != NULL) {
+            (void) mac_group_mov_ring(mip,
+                &mip->mi_rx_groups[0], ring);
+        }
+    }
+    mac_stop_group(group);
+    /*
+     * Possible improvement: see if we can assign the group just released
+     * to another client of the mip.
+     */
+}
+
+/*
+ * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
+ * when a share was allocated to the client.
+ */
+mac_group_t *
+mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
+{
+    mac_group_t *grp;
+    int rv, i;
+
+    /*
+     * TX groups are currently allocated only to MAC clients
+     * which are associated with a share. Since we have a fixed
+     * number of shares and groups, and we already successfully
+     * allocated a share, find an available TX group.
+     */
+    ASSERT(share != NULL);
+    ASSERT(mip->mi_tx_group_free > 0);
+
+    for (i = 0; i < mip->mi_tx_group_count; i++) {
+        grp = &mip->mi_tx_groups[i];
+
+        if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
+            (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
+            continue;
+
+        rv = mac_start_group(grp);
+        ASSERT(rv == 0);
+
+        grp->mrg_state = MAC_GROUP_STATE_RESERVED;
+        break;
+    }
+
+    ASSERT(grp != NULL);
+
+    /*
+     * Populate the group. Rings should be taken from the group
+     * of unassigned rings, which is past the array of TX
+     * groups advertised by the driver.
+     */
+    rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
+        grp, share);
+    if (rv != 0) {
+        DTRACE_PROBE3(tx__group__reserve__alloc__rings,
+            char *, mip->mi_name, int, grp->mrg_index, int, rv);
+
+        mac_stop_group(grp);
+        grp->mrg_state = MAC_GROUP_STATE_UNINIT;
+
+        return (NULL);
+    }
+
+    mip->mi_tx_group_free--;
+
+    return (grp);
+}
+
+void
+mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
+{
+    mac_client_impl_t *mcip = grp->mrg_tx_client;
+    mac_share_handle_t share = mcip->mci_share;
+    mac_ring_t *ring;
+
+    ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
+    ASSERT(share != NULL);
+    ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
+
+    mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
+    while ((ring = grp->mrg_rings) != NULL) {
+        /* move the ring back to the pool */
+        (void) mac_group_mov_ring(mip, mip->mi_tx_groups +
+            mip->mi_tx_group_count, ring);
+    }
+    mac_stop_group(grp);
+    mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
+    grp->mrg_tx_client = NULL;
+    mip->mi_tx_group_free++;
+}
+
+/*
+ * This is a 1-time control path activity initiated by the client (IP).
+ * The mac perimeter protects against other simultaneous control activities,
+ * for example an ioctl that attempts to change the degree of fanout and
+ * increase or decrease the number of softrings associated with this Tx SRS.
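+ *
+ * A minimal caller sketch (my_tx_blocked_cb and my_arg are hypothetical
+ * names; mac_client_tx_notify() itself is defined further below):
+ *
+ *     mac_tx_notify_handle_t h;
+ *     h = mac_client_tx_notify(mch, my_tx_blocked_cb, my_arg);
+ *     ...
+ *     (void) mac_client_tx_notify(mch, NULL, (void *)h);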
+ */
+static mac_tx_notify_cb_t *
+mac_client_tx_notify_add(mac_client_impl_t *mcip,
+    mac_tx_notify_t notify, void *arg)
+{
+    mac_cb_info_t *mcbi;
+    mac_tx_notify_cb_t *mtnfp;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+    mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
+    mtnfp->mtnf_fn = notify;
+    mtnfp->mtnf_arg = arg;
+    mtnfp->mtnf_link.mcb_objp = mtnfp;
+    mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
+    mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
+
+    mcbi = &mcip->mci_tx_notify_cb_info;
+    mutex_enter(mcbi->mcbi_lockp);
+    mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
+    mutex_exit(mcbi->mcbi_lockp);
+    return (mtnfp);
+}
+
+static void
+mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
+{
+    mac_cb_info_t *mcbi;
+    mac_cb_t **cblist;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+    if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
+        &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
+        cmn_err(CE_WARN,
+            "mac_client_tx_notify_remove: callback not "
+            "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
+        return;
+    }
+
+    mcbi = &mcip->mci_tx_notify_cb_info;
+    cblist = &mcip->mci_tx_notify_cb_list;
+    mutex_enter(mcbi->mcbi_lockp);
+    if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
+        kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
+    else
+        mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
+    mutex_exit(mcbi->mcbi_lockp);
+}
+
+/*
+ * mac_client_tx_notify():
+ * Called to add or remove a flow-control callback routine.
+ */
+mac_tx_notify_handle_t
+mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
+    void *ptr)
+{
+    mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+    mac_tx_notify_cb_t *mtnfp = NULL;
+
+    i_mac_perim_enter(mcip->mci_mip);
+
+    if (callb_func != NULL) {
+        /* Add a notify callback */
+        mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
+    } else {
+        mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
+    }
+    i_mac_perim_exit(mcip->mci_mip);
+
+    return ((mac_tx_notify_handle_t)mtnfp);
+}
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
new file mode 100644
index 0000000000..5fd2a6ef55
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -0,0 +1,668 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/list.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/stat.h>
+#include <sys/modhash.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_flow_impl.h>
+
+/*
+ * Broadcast and multicast traffic must be distributed to the MAC clients
+ * that are defined on top of the same MAC. The set of
+ * destinations to which a multicast packet must be sent is a subset
+ * of all MAC clients defined on top of the MAC. A MAC client can be a
+ * member of more than one such subset.
+ *
+ * To accommodate these requirements, we introduce broadcast groups.
+ * A broadcast group is associated with a broadcast or multicast
+ * address. The members of a broadcast group consist of the MAC clients
+ * that should receive copies of packets sent to the address
+ * associated with the group, and are defined on top of the
+ * same MAC.
+ *
+ * The broadcast groups defined on top of a MAC are chained,
+ * hanging off the mac_impl_t. The broadcast group IDs are
+ * globally unique (tracked by mac_bcast_id).
+ */
+
+/*
+ * The same MAC client may be added for different <addr,vid> tuples;
+ * we maintain a ref count for the number of times it has been added,
+ * to account for deleting the MAC client from the group.
+ */
+typedef struct mac_bcast_grp_mcip_s {
+    mac_client_impl_t *mgb_client;
+    int mgb_client_ref;
+} mac_bcast_grp_mcip_t;
+
+typedef struct mac_bcast_grp_s {                /* Protected by */
+    struct mac_bcast_grp_s *mbg_next;           /* SL */
+    void *mbg_addr;                             /* SL */
+    uint16_t mbg_vid;                           /* SL */
+    mac_impl_t *mbg_mac_impl;                   /* WO */
+    mac_addrtype_t mbg_addrtype;                /* WO */
+    flow_entry_t *mbg_flow_ent;                 /* WO */
+    mac_bcast_grp_mcip_t *mbg_clients;          /* mi_rw_lock */
+    uint_t mbg_nclients;                        /* mi_rw_lock */
+    uint_t mbg_nclients_alloc;                  /* SL */
+    uint64_t mbg_clients_gen;                   /* mi_rw_lock */
+    uint32_t mbg_id;                            /* atomic */
+} mac_bcast_grp_t;
+
+static kmem_cache_t *mac_bcast_grp_cache;
+static uint32_t mac_bcast_id = 0;
+
+void
+mac_bcast_init(void)
+{
+    mac_bcast_grp_cache = kmem_cache_create("mac_bcast_grp_cache",
+        sizeof (mac_bcast_grp_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+mac_bcast_fini(void)
+{
+    kmem_cache_destroy(mac_bcast_grp_cache);
+}
+
+mac_impl_t *
+mac_bcast_grp_mip(void *grp)
+{
+    mac_bcast_grp_t *bcast_grp = grp;
+
+    return (bcast_grp->mbg_mac_impl);
+}
+
+/*
+ * Free the specified broadcast group. Invoked when the last reference
+ * to the group is released.
+ */
+void
+mac_bcast_grp_free(void *bcast_grp)
+{
+    mac_bcast_grp_t *grp = bcast_grp;
+    mac_impl_t *mip = grp->mbg_mac_impl;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
+        /*
+         * The address is a multicast address, have the
+         * underlying NIC leave the multicast group.
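+         * (mi_multicst(driver, B_FALSE, addr) below is the leave
+         * operation; the matching B_TRUE join is issued in
+         * mac_bcast_add().)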
+         */
+        (void) mip->mi_multicst(mip->mi_driver, B_FALSE, grp->mbg_addr);
+    }
+
+    ASSERT(grp->mbg_addr != NULL);
+    kmem_free(grp->mbg_addr, mip->mi_type->mt_addr_length);
+    kmem_free(grp->mbg_clients,
+        grp->mbg_nclients_alloc * sizeof (mac_bcast_grp_mcip_t));
+    mip->mi_bcast_ngrps--;
+    kmem_cache_free(mac_bcast_grp_cache, grp);
+}
+
+/*
+ * arg1: broadcast group
+ * arg2: sender MAC client if it is being sent by a MAC client,
+ * NULL if it was received from the wire.
+ */
+void
+mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
+{
+    mac_bcast_grp_t *grp = arg1;
+    mac_client_impl_t *src_mcip = arg2, *dst_mcip;
+    mac_impl_t *mip = grp->mbg_mac_impl;
+    uint64_t gen;
+    uint_t i;
+    mblk_t *mp_chain1;
+    flow_entry_t *flent;
+    int err;
+
+    rw_enter(&mip->mi_rw_lock, RW_READER);
+
+    /*
+     * Pass a copy of the mp chain to every MAC client except the sender
+     * MAC client, if the packet was not received from the underlying NIC.
+     *
+     * The broadcast group lock should not be held across calls to
+     * the flow's callback function, since the same group could
+     * potentially be accessed from the same context. When the lock
+     * is reacquired, changes to the broadcast group while the lock
+     * was released are caught using a generation counter incremented
+     * each time the list of MAC clients associated with the broadcast
+     * group is changed.
+     */
+    for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+        dst_mcip = grp->mbg_clients[i].mgb_client;
+        if (dst_mcip == NULL)
+            continue;
+        flent = dst_mcip->mci_flent;
+        if (flent == NULL || dst_mcip == src_mcip) {
+            /*
+             * Don't send a copy of the packet back to
+             * its sender.
+             */
+            continue;
+        }
+
+        /*
+         * It is important to hold a reference on the
+         * flow_ent here.
+         */
+        if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL)
+            break;
+        /*
+         * Fix the checksum for packets originating
+         * from the local machine.
+         */
+        if ((src_mcip != NULL) &&
+            (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL)
+            break;
+
+        FLOW_TRY_REFHOLD(flent, err);
+        if (err != 0) {
+            freemsgchain(mp_chain1);
+            continue;
+        }
+
+        gen = grp->mbg_clients_gen;
+
+        rw_exit(&mip->mi_rw_lock);
+
+        DTRACE_PROBE4(mac__bcast__send__to, mac_client_impl_t *,
+            src_mcip, flow_fn_t, dst_mcip->mci_flent->fe_cb_fn,
+            void *, dst_mcip->mci_flent->fe_cb_arg1,
+            void *, dst_mcip->mci_flent->fe_cb_arg2);
+
+        (dst_mcip->mci_flent->fe_cb_fn)(dst_mcip->mci_flent->fe_cb_arg1,
+            dst_mcip->mci_flent->fe_cb_arg2, mp_chain1, is_loopback);
+        FLOW_REFRELE(flent);
+
+        rw_enter(&mip->mi_rw_lock, RW_READER);
+
+        /* update stats */
+        if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST)
+            dst_mcip->mci_stat_multircv++;
+        else
+            dst_mcip->mci_stat_brdcstrcv++;
+
+        if (grp->mbg_clients_gen != gen) {
+            /*
+             * The list of MAC clients associated with the group
+             * was changed while the lock was released.
+             * Give up on the current packet.
+             */
+            rw_exit(&mip->mi_rw_lock);
+            freemsgchain(mp_chain);
+            return;
+        }
+    }
+    rw_exit(&mip->mi_rw_lock);
+
+    if (src_mcip != NULL) {
+        /*
+         * The packet was sent from one of the MAC clients,
+         * so we need to send a copy of the packet to the
+         * underlying NIC so that it can be sent on the wire.
+         */
+        mblk_t *rest;
+
+        src_mcip->mci_stat_multixmt++;
+        src_mcip->mci_stat_brdcstxmt++;
+
+        rest = MAC_RING_TX_DEFAULT(mip, mp_chain);
+        if (rest != NULL)
+            freemsgchain(rest);
+    } else {
+        freemsgchain(mp_chain);
+    }
+}
+
+/*
+ * Add the specified MAC client to the group corresponding to the specified
+ * broadcast or multicast address.
+ * Return 0 on success, or an errno value on failure.
+ */
+int
+mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
+    mac_addrtype_t addrtype)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_bcast_grp_t *grp = NULL, **last_grp;
+    size_t addr_len = mip->mi_type->mt_addr_length;
+    int rc = 0;
+    int i, index = -1;
+    mac_mcast_addrs_t *mci_maddr = NULL;
+    mac_mcast_addrs_t *mi_maddr = NULL;
+    mac_mcast_addrs_t **last_maddr;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST ||
+        addrtype == MAC_ADDRTYPE_BROADCAST);
+
+    /* The list is protected by the perimeter */
+    last_grp = &mip->mi_bcast_grp;
+    for (grp = *last_grp; grp != NULL;
+        last_grp = &grp->mbg_next, grp = grp->mbg_next) {
+        if (bcmp(grp->mbg_addr, addr, addr_len) == 0 &&
+            grp->mbg_vid == vid)
+            break;
+    }
+
+    if (grp == NULL) {
+        /*
+         * The group does not yet exist, create it.
+         */
+        flow_desc_t flow_desc;
+        char flow_name[MAXFLOWNAME];
+
+        grp = kmem_cache_alloc(mac_bcast_grp_cache, KM_SLEEP);
+        bzero(grp, sizeof (mac_bcast_grp_t));
+        grp->mbg_next = NULL;
+        grp->mbg_mac_impl = mip;
+
+        DTRACE_PROBE1(mac__bcast__add__new__group, mac_bcast_grp_t *,
+            grp);
+
+        grp->mbg_addr = kmem_zalloc(addr_len, KM_SLEEP);
+        bcopy(addr, grp->mbg_addr, addr_len);
+        grp->mbg_addrtype = addrtype;
+        grp->mbg_vid = vid;
+
+        /*
+         * Add a new flow to the underlying MAC.
+         */
+        bzero(&flow_desc, sizeof (flow_desc));
+        bcopy(addr, &flow_desc.fd_dst_mac, addr_len);
+        flow_desc.fd_mac_len = (uint32_t)addr_len;
+
+        flow_desc.fd_mask = FLOW_LINK_DST;
+        if (vid != 0) {
+            flow_desc.fd_vid = vid;
+            flow_desc.fd_mask |= FLOW_LINK_VID;
+        }
+
+        grp->mbg_id = atomic_add_32_nv(&mac_bcast_id, 1);
+        (void) sprintf(flow_name,
+            "mac/%s/mcast%d", mip->mi_name, grp->mbg_id);
+
+        rc = mac_flow_create(&flow_desc, NULL, flow_name,
+            grp, FLOW_MCAST, &grp->mbg_flow_ent);
+        if (rc != 0) {
+            kmem_free(grp->mbg_addr, addr_len);
+            kmem_cache_free(mac_bcast_grp_cache, grp);
+            return (rc);
+        }
+        grp->mbg_flow_ent->fe_mbg = grp;
+        mip->mi_bcast_ngrps++;
+
+        /*
+         * Initial creation reference on the flow. This is released
+         * in the corresponding delete action, mac_bcast_delete().
+         */
+        FLOW_REFHOLD(grp->mbg_flow_ent);
+
+        /*
+         * When a multicast or broadcast packet is received
+         * by the underlying NIC, mac_rx_classify() will invoke
+         * mac_bcast_send() with arg2=NULL, which will cause
+         * mac_bcast_send() to send a copy of the packet(s)
+         * to every MAC client opened on top of the underlying MAC.
+         *
+         * When the mac_bcast_send() function is invoked from
+         * the transmit path of a MAC client, it will specify the
+         * transmitting MAC client as the arg2 value, which will
+         * allow mac_bcast_send() to skip that MAC client and not
+         * send it a copy of the packet.
+         *
+         * We program the classifier to dispatch matching broadcast
+         * packets to mac_bcast_send().
+         */
+
+        grp->mbg_flow_ent->fe_cb_fn = mac_bcast_send;
+        grp->mbg_flow_ent->fe_cb_arg1 = grp;
+        grp->mbg_flow_ent->fe_cb_arg2 = NULL;
+
+        rc = mac_flow_add(mip->mi_flow_tab, grp->mbg_flow_ent);
+        if (rc != 0) {
+            FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
+            return (rc);
+        }
+
+        /*
+         * For multicast addresses, have the underlying MAC
+         * join the corresponding multicast group.
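+         * Note that the new group is linked onto mi_bcast_grp
+         * (*last_grp = grp) only after both the flow addition and
+         * the driver join succeed, so the error paths above and
+         * below leave the group list untouched.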
+         */
+        if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+            rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
+            if (rc != 0) {
+                mac_flow_remove(mip->mi_flow_tab,
+                    grp->mbg_flow_ent, B_FALSE);
+                mac_flow_wait(grp->mbg_flow_ent,
+                    FLOW_DRIVER_UPCALL);
+                FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
+                return (rc);
+            }
+        }
+
+        *last_grp = grp;
+    }
+
+    ASSERT(grp->mbg_addrtype == addrtype);
+
+    /*
+     * Add the MAC client to the list of MAC clients associated
+     * with the group.
+     */
+    rw_enter(&mip->mi_rw_lock, RW_WRITER);
+    if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+        /*
+         * We maintain a separate list for each MAC client. Get
+         * the entry, or add it if it is not present.
+         */
+        last_maddr = &mcip->mci_mcast_addrs;
+        for (mci_maddr = *last_maddr; mci_maddr != NULL;
+            last_maddr = &mci_maddr->mma_next,
+            mci_maddr = mci_maddr->mma_next) {
+            if (bcmp(mci_maddr->mma_addr, addr, addr_len) == 0)
+                break;
+        }
+        if (mci_maddr == NULL) {
+            mci_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+                KM_SLEEP);
+            bcopy(addr, mci_maddr->mma_addr, addr_len);
+            *last_maddr = mci_maddr;
+        }
+        mci_maddr->mma_ref++;
+
+        /*
+         * In case of a driver (say aggr), we also need this
+         * information on a per-MAC-instance basis.
+         */
+        last_maddr = &mip->mi_mcast_addrs;
+        for (mi_maddr = *last_maddr; mi_maddr != NULL;
+            last_maddr = &mi_maddr->mma_next,
+            mi_maddr = mi_maddr->mma_next) {
+            if (bcmp(mi_maddr->mma_addr, addr, addr_len) == 0)
+                break;
+        }
+        if (mi_maddr == NULL) {
+            mi_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+                KM_SLEEP);
+            bcopy(addr, mi_maddr->mma_addr, addr_len);
+            *last_maddr = mi_maddr;
+        }
+        mi_maddr->mma_ref++;
+    }
+    for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+        /*
+         * The MAC client was already added, say when we have
+         * different unicast addresses with the same vid.
+         * Just increment the ref and we are done.
+         */
+        if (grp->mbg_clients[i].mgb_client == mcip) {
+            grp->mbg_clients[i].mgb_client_ref++;
+            goto add_done;
+        } else if (grp->mbg_clients[i].mgb_client == NULL &&
+            index == -1) {
+            index = i;
+        }
+    }
+    if (grp->mbg_nclients_alloc == grp->mbg_nclients) {
+        mac_bcast_grp_mcip_t *new_clients;
+        uint_t new_size = grp->mbg_nclients + 1;
+
+        new_clients = kmem_zalloc(new_size *
+            sizeof (mac_bcast_grp_mcip_t), KM_SLEEP);
+
+        if (grp->mbg_nclients > 0) {
+            ASSERT(grp->mbg_clients != NULL);
+            bcopy(grp->mbg_clients, new_clients, grp->mbg_nclients *
+                sizeof (mac_bcast_grp_mcip_t));
+            kmem_free(grp->mbg_clients, grp->mbg_nclients *
+                sizeof (mac_bcast_grp_mcip_t));
+        }
+
+        grp->mbg_clients = new_clients;
+        grp->mbg_nclients_alloc = new_size;
+        index = new_size - 1;
+    }
+
+    ASSERT(index != -1);
+    grp->mbg_clients[index].mgb_client = mcip;
+    grp->mbg_clients[index].mgb_client_ref = 1;
+    grp->mbg_nclients++;
+    /*
+     * Since we're adding to the list of MAC clients using that group,
+     * kick the generation count, which will allow mac_bcast_send()
+     * to detect that condition after re-acquiring the lock.
+     */
+    grp->mbg_clients_gen++;
+add_done:
+    rw_exit(&mip->mi_rw_lock);
+
+    return (0);
+}
+
+/*
+ * Remove the specified MAC client from the group corresponding to
+ * the specified broadcast or multicast address.
+ *
+ * Note: mac_bcast_delete() calls mac_flow_remove() and then
+ * mac_flow_wait(), which cv_waits for fe_refcnt to drop to 0. So this
+ * function should not be called from interrupt or STREAMS context.
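+ *
+ * Usage pairs with mac_bcast_add(); a minimal sketch under the mac
+ * perimeter (error handling elided, arguments as in the add call):
+ *
+ *     (void) mac_bcast_add(mcip, addr, vid, MAC_ADDRTYPE_MULTICAST);
+ *     ...
+ *     mac_bcast_delete(mcip, addr, vid);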
+ */
+void
+mac_bcast_delete(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_bcast_grp_t *grp = NULL, **prev;
+    size_t addr_len = mip->mi_type->mt_addr_length;
+    flow_entry_t *flent;
+    uint_t i;
+    mac_mcast_addrs_t *maddr = NULL;
+    mac_mcast_addrs_t **mprev;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    /* find the broadcast group. The list is protected by the perimeter */
+    prev = &mip->mi_bcast_grp;
+    for (grp = mip->mi_bcast_grp; grp != NULL; prev = &grp->mbg_next,
+        grp = grp->mbg_next) {
+        if (bcmp(grp->mbg_addr, addr, addr_len) == 0 &&
+            grp->mbg_vid == vid)
+            break;
+    }
+    ASSERT(grp != NULL);
+
+    /*
+     * Remove the MAC client from the list of MAC clients associated
+     * with that broadcast group.
+     *
+     * We mark the mbg_clients[] location corresponding to the removed MAC
+     * client NULL and reuse that location when we add a new MAC client.
+     */
+
+    rw_enter(&mip->mi_rw_lock, RW_WRITER);
+
+    for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+        if (grp->mbg_clients[i].mgb_client == mcip)
+            break;
+    }
+
+    ASSERT(i < grp->mbg_nclients_alloc);
+    /*
+     * If there are more references to this MAC client, then we let
+     * it remain till it goes to 0.
+     */
+    if (--grp->mbg_clients[i].mgb_client_ref > 0)
+        goto update_maddr;
+
+    grp->mbg_clients[i].mgb_client = NULL;
+    grp->mbg_clients[i].mgb_client_ref = 0;
+
+    /*
+     * Since we're removing from the list of MAC clients using that group,
+     * kick the generation count, which will allow mac_bcast_send()
+     * to detect that condition.
+     */
+    grp->mbg_clients_gen++;
+
+    if (--grp->mbg_nclients == 0) {
+        /*
+         * The last MAC client of the group was just removed.
+         * Unlink the current group from the list of groups
+         * defined on top of the underlying NIC. The group
+         * structure will stay around until the last reference
+         * is dropped.
+         */
+        *prev = grp->mbg_next;
+    }
+update_maddr:
+    if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
+        mprev = &mcip->mci_mcast_addrs;
+        for (maddr = mcip->mci_mcast_addrs; maddr != NULL;
+            mprev = &maddr->mma_next, maddr = maddr->mma_next) {
+            if (bcmp(grp->mbg_addr, maddr->mma_addr,
+                mip->mi_type->mt_addr_length) == 0)
+                break;
+        }
+        ASSERT(maddr != NULL);
+        if (--maddr->mma_ref == 0) {
+            *mprev = maddr->mma_next;
+            maddr->mma_next = NULL;
+            kmem_free(maddr, sizeof (mac_mcast_addrs_t));
+        }
+
+        mprev = &mip->mi_mcast_addrs;
+        for (maddr = mip->mi_mcast_addrs; maddr != NULL;
+            mprev = &maddr->mma_next, maddr = maddr->mma_next) {
+            if (bcmp(grp->mbg_addr, maddr->mma_addr,
+                mip->mi_type->mt_addr_length) == 0)
+                break;
+        }
+        ASSERT(maddr != NULL);
+        if (--maddr->mma_ref == 0) {
+            *mprev = maddr->mma_next;
+            maddr->mma_next = NULL;
+            kmem_free(maddr, sizeof (mac_mcast_addrs_t));
+        }
+    }
+    rw_exit(&mip->mi_rw_lock);
+
+    /*
+     * If the group itself is being removed, remove the
+     * corresponding flow from the underlying NIC.
+     */
+    flent = grp->mbg_flow_ent;
+    if (grp->mbg_nclients == 0) {
+        mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+        mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+        FLOW_FINAL_REFRELE(flent);
+    }
+}
+
+/*
+ * This will be called by a driver, such as aggr, when a port is added or
+ * removed, to add/remove the port to/from all the multicast addresses for
+ * that aggr.
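+ *
+ * A hypothetical aggr-side call when a port joins (aggr_port_multicst_fn
+ * and port are illustrative names, not defined here):
+ *
+ *     mac_bcast_refresh(mip, aggr_port_multicst_fn, port, B_TRUE);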
+ */
+void
+mac_bcast_refresh(mac_impl_t *mip, mac_multicst_t refresh_fn, void *arg,
+    boolean_t add)
+{
+    mac_mcast_addrs_t *grp, *next;
+
+    ASSERT(refresh_fn != NULL);
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    /*
+     * Walk the multicast address list and call the refresh function for
+     * each address.
+     */
+
+    for (grp = mip->mi_mcast_addrs; grp != NULL; grp = next) {
+        /*
+         * Save the next pointer just in case the refresh
+         * function's action causes the group entry to be
+         * freed.
+         * We won't be adding to this list as part of the
+         * refresh.
+         */
+        next = grp->mma_next;
+        refresh_fn(arg, add, grp->mma_addr);
+    }
+}
+
+/*
+ * Walk the MAC client's multicast address list and add/remove the addr/vid
+ * ('arg' is 'flent') to all the addresses.
+ */
+void
+mac_client_bcast_refresh(mac_client_impl_t *mcip, mac_multicst_t refresh_fn,
+    void *arg, boolean_t add)
+{
+    mac_mcast_addrs_t *grp, *next;
+    mac_impl_t *mip = mcip->mci_mip;
+
+    ASSERT(refresh_fn != NULL);
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+    /*
+     * Walk the multicast address list and call the refresh function for
+     * each address.
+     * Broadcast addresses are not added or removed through the multicast
+     * entry points, so don't include them as part of the refresh.
+     */
+    for (grp = mcip->mci_mcast_addrs; grp != NULL; grp = next) {
+        /*
+         * Save the next pointer just in case the refresh
+         * function's action causes the group entry to be
+         * freed.
+         * We won't be adding to this list as part of the
+         * refresh.
+         */
+        next = grp->mma_next;
+        refresh_fn(arg, add, grp->mma_addr);
+    }
+}
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
new file mode 100644
index 0000000000..bd6b552e67
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -0,0 +1,3763 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * - General Introduction:
+ *
+ * This file contains the implementation of the MAC client kernel
+ * API and related code. The MAC client API allows a kernel module
+ * to gain access to a MAC instance (physical NIC, link aggregation, etc).
+ * It allows a MAC client to associate itself with a MAC address,
+ * VLANs, and callback functions for data traffic and for promiscuous mode.
+ * The MAC client API is also used to specify the properties associated
+ * with a MAC client, such as bandwidth limits, priority, CPUs, etc.
+ * These properties are further used to determine the hardware resources
+ * to allocate to the various MAC clients.
+ *
+ * - Primary MAC clients:
+ *
+ * The MAC client API refers to "primary MAC clients".
+ * A primary MAC client is a client which "owns" the primary MAC address of
+ * the underlying MAC instance. The primary MAC address is called out
+ * since it is associated with specific semantics: the primary MAC
+ * address is the MAC address which is assigned to the IP interface
+ * when it is plumbed, and the primary MAC address is assigned
+ * to VLAN data-links. The primary address of a MAC instance can
+ * also change dynamically from under the MAC client, for example
+ * as a result of a change of state of a link aggregation. In that
+ * case the MAC layer automatically updates all data-structures which
+ * refer to the current value of the primary MAC address. Typical
+ * primary MAC clients are dls, aggr, and xnb. A typical non-primary
+ * MAC client is the vnic driver.
+ *
+ * - Virtual Switching:
+ *
+ * The MAC layer implements a virtual switch between the MAC clients
+ * (primary and non-primary) defined on top of the same underlying
+ * NIC (physical, link aggregation, etc). The virtual switch is
+ * VLAN-aware, i.e. it allows multiple MAC clients to be members
+ * of one or more VLANs, and the virtual switch will distribute
+ * multicast tagged packets only to the members of the corresponding
+ * VLANs.
+ *
+ * - Upper vs Lower MAC:
+ *
+ * Creating a VNIC on top of a MAC instance effectively causes
+ * two MAC instances to be layered on top of each other, one for
+ * the VNIC(s), one for the underlying MAC instance (physical NIC,
+ * link aggregation, etc). In the code below we refer to the
+ * underlying NIC as the "lower MAC", and we refer to VNICs as
+ * the "upper MAC".
+ *
+ * - Pass-through for VNICs:
+ *
+ * When VNICs are created on top of an underlying MAC, this causes
+ * a layering of two MAC instances. Since the lower MAC already
+ * does the switching and demultiplexing to its MAC clients, the
+ * upper MAC would simply have to pass packets to the layer below
+ * or above it, which would introduce overhead. In order to avoid
+ * this overhead, the MAC layer implements a pass-through mechanism
+ * for VNICs. When a VNIC opens the lower MAC instance, it saves
+ * the MAC client handle it obtains from the MAC layer. When a MAC
+ * client opens a VNIC (upper MAC), the MAC layer detects that
+ * the MAC being opened is a VNIC, and gets the MAC client handle
+ * that the VNIC driver obtained from the lower MAC. This exchange
+ * is done through a private capability between the MAC layer
+ * and the VNIC driver. The upper MAC then returns that handle
+ * directly to its MAC client. Any operation done by the upper
+ * MAC client is now done on the lower MAC client handle, which
+ * allows the VNIC driver to be completely bypassed for the
+ * performance sensitive data-path.
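+ *
+ * The handle exchange itself is small; from mac_vnic_lower() below:
+ *
+ *     VERIFY(i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, &cap));
+ *     mcip = cap.mcv_mac_client_handle(cap.mcv_arg);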
+ * + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/id_space.h> +#include <sys/esunddi.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/modhash.h> +#include <sys/mac_impl.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_soft_ring.h> +#include <sys/dls.h> +#include <sys/dld.h> +#include <sys/modctl.h> +#include <sys/fs/dv_node.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/callb.h> +#include <sys/cpuvar.h> +#include <sys/atomic.h> +#include <sys/sdt.h> +#include <sys/mac_flow.h> +#include <sys/ddi_intr_impl.h> +#include <sys/disp.h> +#include <sys/sdt.h> +#include <sys/vnic.h> +#include <sys/vnic_impl.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/exacct.h> +#include <sys/exacct_impl.h> +#include <inet/nd.h> +#include <sys/ethernet.h> + +kmem_cache_t *mac_client_impl_cache; +kmem_cache_t *mac_promisc_impl_cache; + +static boolean_t mac_client_single_rcvr(mac_client_impl_t *); +static flow_entry_t *mac_client_swap_mciflent(mac_client_impl_t *); +static flow_entry_t *mac_client_get_flow(mac_client_impl_t *, + mac_unicast_impl_t *); +static void mac_client_remove_flow_from_list(mac_client_impl_t *, + flow_entry_t *); +static void mac_client_add_to_flow_list(mac_client_impl_t *, flow_entry_t *); +static void mac_rename_flow_names(mac_client_impl_t *, const char *); +static void mac_virtual_link_update(mac_impl_t *); + +/* ARGSUSED */ +static int +i_mac_client_impl_ctor(void *buf, void *arg, int kmflag) +{ + int i; + mac_client_impl_t *mcip = buf; + + bzero(buf, MAC_CLIENT_IMPL_SIZE); + mutex_init(&mcip->mci_tx_cb_lock, NULL, MUTEX_DRIVER, NULL); + mcip->mci_tx_notify_cb_info.mcbi_lockp = &mcip->mci_tx_cb_lock; + + ASSERT(mac_tx_percpu_cnt >= 0); + for (i = 0; i <= mac_tx_percpu_cnt; i++) { + mutex_init(&mcip->mci_tx_pcpu[i].pcpu_tx_lock, NULL, + MUTEX_DRIVER, NULL); + } + cv_init(&mcip->mci_tx_cv, NULL, CV_DRIVER, NULL); + + return (0); +} + +/* ARGSUSED */ +static void +i_mac_client_impl_dtor(void *buf, void *arg) +{ + int i; + mac_client_impl_t *mcip = buf; + + ASSERT(mcip->mci_promisc_list == NULL); + ASSERT(mcip->mci_unicast_list == NULL); + ASSERT(mcip->mci_state_flags == 0); + ASSERT(mcip->mci_tx_flag == 0); + + mutex_destroy(&mcip->mci_tx_cb_lock); + + ASSERT(mac_tx_percpu_cnt >= 0); + for (i = 0; i <= mac_tx_percpu_cnt; i++) { + ASSERT(mcip->mci_tx_pcpu[i].pcpu_tx_refcnt == 0); + mutex_destroy(&mcip->mci_tx_pcpu[i].pcpu_tx_lock); + } + cv_destroy(&mcip->mci_tx_cv); +} + +/* ARGSUSED */ +static int +i_mac_promisc_impl_ctor(void *buf, void *arg, int kmflag) +{ + mac_promisc_impl_t *mpip = buf; + + bzero(buf, sizeof (mac_promisc_impl_t)); + mpip->mpi_mci_link.mcb_objp = buf; + mpip->mpi_mci_link.mcb_objsize = sizeof (mac_promisc_impl_t); + mpip->mpi_mi_link.mcb_objp = buf; + mpip->mpi_mi_link.mcb_objsize = sizeof (mac_promisc_impl_t); + return (0); +} + +/* ARGSUSED */ +static void +i_mac_promisc_impl_dtor(void *buf, void *arg) +{ + mac_promisc_impl_t *mpip = buf; + + ASSERT(mpip->mpi_mci_link.mcb_objp != NULL); + ASSERT(mpip->mpi_mci_link.mcb_objsize == sizeof (mac_promisc_impl_t)); + ASSERT(mpip->mpi_mi_link.mcb_objp == mpip->mpi_mci_link.mcb_objp); + ASSERT(mpip->mpi_mi_link.mcb_objsize == sizeof (mac_promisc_impl_t)); + + mpip->mpi_mci_link.mcb_objp = NULL; + mpip->mpi_mci_link.mcb_objsize = 0; + mpip->mpi_mi_link.mcb_objp = NULL; + mpip->mpi_mi_link.mcb_objsize = 0; + + 
+    ASSERT(mpip->mpi_mci_link.mcb_flags == 0);
+    mpip->mpi_mci_link.mcb_objsize = 0;
+}
+
+void
+mac_client_init(void)
+{
+    ASSERT(mac_tx_percpu_cnt >= 0);
+
+    mac_client_impl_cache = kmem_cache_create("mac_client_impl_cache",
+        MAC_CLIENT_IMPL_SIZE, 0, i_mac_client_impl_ctor,
+        i_mac_client_impl_dtor, NULL, NULL, NULL, 0);
+    ASSERT(mac_client_impl_cache != NULL);
+
+    mac_promisc_impl_cache = kmem_cache_create("mac_promisc_impl_cache",
+        sizeof (mac_promisc_impl_t), 0, i_mac_promisc_impl_ctor,
+        i_mac_promisc_impl_dtor, NULL, NULL, NULL, 0);
+    ASSERT(mac_promisc_impl_cache != NULL);
+}
+
+void
+mac_client_fini(void)
+{
+    kmem_cache_destroy(mac_client_impl_cache);
+    kmem_cache_destroy(mac_promisc_impl_cache);
+}
+
+/*
+ * Return the lower MAC client handle from the VNIC driver for the
+ * specified VNIC MAC instance.
+ */
+mac_client_impl_t *
+mac_vnic_lower(mac_impl_t *mip)
+{
+    mac_capab_vnic_t cap;
+    mac_client_impl_t *mcip;
+
+    VERIFY(i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, &cap));
+    mcip = cap.mcv_mac_client_handle(cap.mcv_arg);
+
+    return (mcip);
+}
+
+/*
+ * Return the MAC client handle of the primary MAC client for the
+ * specified MAC instance, or NULL otherwise.
+ */
+mac_client_impl_t *
+mac_primary_client_handle(mac_impl_t *mip)
+{
+    mac_client_impl_t *mcip;
+
+    if (mip->mi_state_flags & MIS_IS_VNIC)
+        return (mac_vnic_lower(mip));
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    for (mcip = mip->mi_clients_list; mcip != NULL;
+        mcip = mcip->mci_client_next) {
+        if (MCIP_DATAPATH_SETUP(mcip) && mac_is_primary_client(mcip))
+            return (mcip);
+    }
+    return (NULL);
+}
+
+/*
+ * Open a MAC specified by its MAC name.
+ */
+int
+mac_open(const char *macname, mac_handle_t *mhp)
+{
+    mac_impl_t *mip;
+    int err;
+
+    /*
+     * Look up its entry in the global hash table.
+     */
+    if ((err = mac_hold(macname, &mip)) != 0)
+        return (err);
+
+    /*
+     * Hold the dip associated with the MAC to prevent it from being
+     * detached. For a softmac, its underlying dip is held by the
+     * mi_open() callback.
+     *
+     * This is done to be more tolerant of some defective drivers,
+     * which incorrectly handle mac_unregister() failure in their
+     * xxx_detach() routine. For example, some drivers ignore the
+     * failure of mac_unregister() and free resources that
+     * are needed for data transmission.
+     */
+    e_ddi_hold_devi(mip->mi_dip);
+
+    if (!(mip->mi_callbacks->mc_callbacks & MC_OPEN)) {
+        *mhp = (mac_handle_t)mip;
+        return (0);
+    }
+
+    /*
+     * The mac perimeter is used in both mac_open and mac_close by the
+     * framework to single thread the MC_OPEN/MC_CLOSE of drivers.
+     */
+    i_mac_perim_enter(mip);
+    mip->mi_oref++;
+    if (mip->mi_oref != 1 || ((err = mip->mi_open(mip->mi_driver)) == 0)) {
+        *mhp = (mac_handle_t)mip;
+        i_mac_perim_exit(mip);
+        return (0);
+    }
+    mip->mi_oref--;
+    ddi_release_devi(mip->mi_dip);
+    mac_rele(mip);
+    i_mac_perim_exit(mip);
+    return (err);
+}
+
+/*
+ * Open a MAC specified by its linkid.
+ */
+int
+mac_open_by_linkid(datalink_id_t linkid, mac_handle_t *mhp)
+{
+    dls_dl_handle_t dlh;
+    int err;
+
+    if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
+        return (err);
+
+    dls_devnet_prop_task_wait(dlh);
+
+    err = mac_open(dls_devnet_mac(dlh), mhp);
+
+    dls_devnet_rele_tmp(dlh);
+    return (err);
+}
+
+/*
+ * Open a MAC specified by its link name.
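+ * The resolution chain, per this function and mac_open_by_linkid() above:
+ * link name -> dls_mgmt_get_linkid() -> linkid -> dls_devnet_hold_tmp()
+ * -> mac_open(dls_devnet_mac(dlh)).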
+ */
+int
+mac_open_by_linkname(const char *link, mac_handle_t *mhp)
+{
+    datalink_id_t linkid;
+    int err;
+
+    if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0)
+        return (err);
+    return (mac_open_by_linkid(linkid, mhp));
+}
+
+/*
+ * Close the specified MAC.
+ */
+void
+mac_close(mac_handle_t mh)
+{
+    mac_impl_t *mip = (mac_impl_t *)mh;
+
+    i_mac_perim_enter(mip);
+    /*
+     * The mac perimeter is used in both mac_open and mac_close by the
+     * framework to single thread the MC_OPEN/MC_CLOSE of drivers.
+     */
+    if (mip->mi_callbacks->mc_callbacks & MC_OPEN) {
+        ASSERT(mip->mi_oref != 0);
+        if (--mip->mi_oref == 0) {
+            if ((mip->mi_callbacks->mc_callbacks & MC_CLOSE))
+                mip->mi_close(mip->mi_driver);
+        }
+    }
+    i_mac_perim_exit(mip);
+    ddi_release_devi(mip->mi_dip);
+    mac_rele(mip);
+}
+
+/*
+ * Misc utility functions to retrieve various information about a MAC
+ * instance or a MAC client.
+ */
+
+const mac_info_t *
+mac_info(mac_handle_t mh)
+{
+    return (&((mac_impl_t *)mh)->mi_info);
+}
+
+dev_info_t *
+mac_devinfo_get(mac_handle_t mh)
+{
+    return (((mac_impl_t *)mh)->mi_dip);
+}
+
+const char *
+mac_name(mac_handle_t mh)
+{
+    return (((mac_impl_t *)mh)->mi_name);
+}
+
+char *
+mac_client_name(mac_client_handle_t mch)
+{
+    return (((mac_client_impl_t *)mch)->mci_name);
+}
+
+minor_t
+mac_minor(mac_handle_t mh)
+{
+    return (((mac_impl_t *)mh)->mi_minor);
+}
+
+/*
+ * Return the VID associated with a MAC client. This function should
+ * be called for clients which are associated with only one VID.
+ */
+uint16_t
+mac_client_vid(mac_client_handle_t mch)
+{
+    uint16_t vid = VLAN_ID_NONE;
+    mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+    flow_desc_t flow_desc;
+
+    if (mcip->mci_nflents == 0)
+        return (vid);
+
+    ASSERT(MCIP_DATAPATH_SETUP(mcip) && mac_client_single_rcvr(mcip));
+
+    mac_flow_get_desc(mcip->mci_flent, &flow_desc);
+    if ((flow_desc.fd_mask & FLOW_LINK_VID) != 0)
+        vid = flow_desc.fd_vid;
+
+    return (vid);
+}
+
+/*
+ * Return the link speed associated with the specified MAC client.
+ *
+ * The link speed of a MAC client is equal to the smallest value of
+ * 1) the current link speed of the underlying NIC, or
+ * 2) the bandwidth limit set for the MAC client.
+ *
+ * Note that the bandwidth limit can be higher than the speed
+ * of the underlying NIC. This is allowed to avoid spurious
+ * administrative action failures or artificially lowering the
+ * bandwidth limit of a link that may have temporarily lowered
+ * its link speed due to a hardware problem or administrator action.
+ */
+static uint64_t
+mac_client_ifspeed(mac_client_impl_t *mcip)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    uint64_t nic_speed;
+
+    nic_speed = mac_stat_get((mac_handle_t)mip, MAC_STAT_IFSPEED);
+
+    if (nic_speed == 0) {
+        return (0);
+    } else {
+        uint64_t policy_limit = (uint64_t)-1;
+
+        if (MCIP_RESOURCE_PROPS_MASK(mcip) & MRP_MAXBW)
+            policy_limit = MCIP_RESOURCE_PROPS_MAXBW(mcip);
+
+        return (MIN(policy_limit, nic_speed));
+    }
+}
+
+/*
+ * Return the link state of the specified client. If there is more
+ * than one client of the underlying mac_impl_t, the link state
+ * will always be UP regardless of the link state of the underlying
+ * mac_impl_t. This is needed to allow the MAC clients to continue
+ * to communicate with each other even when the physical link of
+ * their mac_impl_t is down.
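+ * (Untagged clients would also compare equal under this rule if their
+ * mui_vid is VLAN_ID_NONE.)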
+ */ +static uint64_t +mac_client_link_state(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + uint16_t vid; + mac_client_impl_t *mci_list; + mac_unicast_impl_t *mui_list, *oth_mui_list; + + /* + * Returns LINK_STATE_UP if there are other MAC clients defined on + * mac_impl_t which share the same VLAN ID as that of mcip. Note that + * if 'mcip' has more than one VID then we match ANY one of its + * VIDs with the other MAC clients' VIDs and return LINK_STATE_UP. + */ + rw_enter(&mcip->mci_rw_lock, RW_READER); + for (mui_list = mcip->mci_unicast_list; mui_list != NULL; + mui_list = mui_list->mui_next) { + vid = mui_list->mui_vid; + for (mci_list = mip->mi_clients_list; mci_list != NULL; + mci_list = mci_list->mci_client_next) { + if (mci_list == mcip) + continue; + for (oth_mui_list = mci_list->mci_unicast_list; + oth_mui_list != NULL; oth_mui_list = oth_mui_list-> + mui_next) { + if (vid == oth_mui_list->mui_vid) { + rw_exit(&mcip->mci_rw_lock); + return (LINK_STATE_UP); + } + } + } + } + rw_exit(&mcip->mci_rw_lock); + + return (mac_stat_get((mac_handle_t)mip, MAC_STAT_LINK_STATE)); +} + +/* + * Return the statistics of a MAC client. These statistics are different + * from the statistics of the underlying MAC which are returned by + * mac_stat_get(). + */ +uint64_t +mac_client_stat_get(mac_client_handle_t mch, uint_t stat) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + uint64_t val; + + switch (stat) { + case MAC_STAT_LINK_STATE: + val = mac_client_link_state(mcip); + break; + case MAC_STAT_LINK_UP: + val = (mac_client_link_state(mcip) == LINK_STATE_UP); + break; + case MAC_STAT_PROMISC: + val = mac_stat_get((mac_handle_t)mip, MAC_STAT_PROMISC); + break; + case MAC_STAT_IFSPEED: + val = mac_client_ifspeed(mcip); + break; + case MAC_STAT_MULTIRCV: + val = mcip->mci_stat_multircv; + break; + case MAC_STAT_BRDCSTRCV: + val = mcip->mci_stat_brdcstrcv; + break; + case MAC_STAT_MULTIXMT: + val = mcip->mci_stat_multixmt; + break; + case MAC_STAT_BRDCSTXMT: + val = mcip->mci_stat_brdcstxmt; + break; + case MAC_STAT_OBYTES: + val = mcip->mci_stat_obytes; + break; + case MAC_STAT_OPACKETS: + val = mcip->mci_stat_opackets; + break; + case MAC_STAT_OERRORS: + val = mcip->mci_stat_oerrors; + break; + case MAC_STAT_IPACKETS: + val = mcip->mci_stat_ipackets; + break; + case MAC_STAT_RBYTES: + val = mcip->mci_stat_ibytes; + break; + case MAC_STAT_IERRORS: + val = mcip->mci_stat_ierrors; + break; + default: + val = mac_stat_default(mip, stat); + break; + } + + return (val); +} + +/* + * Return the statistics of the specified MAC instance. + */ +uint64_t +mac_stat_get(mac_handle_t mh, uint_t stat) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + uint64_t val; + int ret; + + /* + * The range of stat determines where it is maintained. Stat + * values from 0 up to (but not including) MAC_STAT_MIN are + * maintained by the mac module itself. Everything else is + * maintained by the driver. + * + * If the mac_impl_t being queried corresponds to a VNIC, + * the stats need to be queried from the lower MAC client + * corresponding to the VNIC. (The mac_link_update() + * invoked by the driver to the lower MAC causes the *lower + * MAC* to update its mi_linkstate, and send a notification + * to its MAC clients. Due to the VNIC passthrough, + * these notifications are sent to the upper MAC clients + * of the VNIC directly, and the upper mac_impl_t of the VNIC + * does not have a valid mi_linkstate.)
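 *
 * For illustration (editor's sketch): both kinds of statistic are
 * fetched through the same call, e.g.
 *
 *	link = mac_stat_get(mh, MAC_STAT_LINK_STATE);	mac module
 *	speed = mac_stat_get(mh, MAC_STAT_IFSPEED);	driver, mi_getstat()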
+ */ + if (stat < MAC_STAT_MIN && !(mip->mi_state_flags & MIS_IS_VNIC)) { + /* these stats are maintained by the mac module itself */ + switch (stat) { + case MAC_STAT_LINK_STATE: + return (mip->mi_linkstate); + case MAC_STAT_LINK_UP: + return (mip->mi_linkstate == LINK_STATE_UP); + case MAC_STAT_PROMISC: + return (mip->mi_devpromisc != 0); + default: + ASSERT(B_FALSE); + } + } + + /* + * Call the driver to get the given statistic. + */ + ret = mip->mi_getstat(mip->mi_driver, stat, &val); + if (ret != 0) { + /* + * The driver doesn't support this statistic. Get the + * statistic's default value. + */ + val = mac_stat_default(mip, stat); + } + return (val); +} + +/* + * Utility function which returns the VID associated with a flow entry. + */ +uint16_t +i_mac_flow_vid(flow_entry_t *flent) +{ + flow_desc_t flow_desc; + + mac_flow_get_desc(flent, &flow_desc); + + if ((flow_desc.fd_mask & FLOW_LINK_VID) != 0) + return (flow_desc.fd_vid); + return (VLAN_ID_NONE); +} + +/* + * Verify the validity of the specified unicast MAC address. Returns B_TRUE + * if the address is valid, B_FALSE otherwise (multicast address, or incorrect + * length). + */ +boolean_t +mac_unicst_verify(mac_handle_t mh, const uint8_t *addr, uint_t len) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + /* + * Verify the address. No lock is needed since mi_type and plugin + * details don't change after mac_register(). + */ + if ((len != mip->mi_type->mt_addr_length) || + (mip->mi_type->mt_ops.mtops_unicst_verify(addr, + mip->mi_pdata)) != 0) { + return (B_FALSE); + } else { + return (B_TRUE); + } +} + +void +mac_sdu_get(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (min_sdu != NULL) + *min_sdu = mip->mi_sdu_min; + if (max_sdu != NULL) + *max_sdu = mip->mi_sdu_max; +} + +/* + * Update the MAC unicast address of the specified client's flows. Currently + * only one unicast MAC address is allowed per client. + */ +static void +mac_unicast_update_client_flow(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent = mcip->mci_flent; + mac_address_t *map = mcip->mci_unicast; + flow_desc_t flow_desc; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + ASSERT(flent != NULL); + + mac_flow_get_desc(flent, &flow_desc); + ASSERT(flow_desc.fd_mask & FLOW_LINK_DST); + + bcopy(map->ma_addr, flow_desc.fd_dst_mac, map->ma_len); + mac_flow_set_desc(flent, &flow_desc); + + /* + * A MAC client could have one MAC address but multiple + * VLANs. In that case update the flow entries corresponding + * to all VLANs of the MAC client. + */ + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + mac_flow_get_desc(flent, &flow_desc); + if (!(flent->fe_type & FLOW_PRIMARY_MAC || + flent->fe_type & FLOW_VNIC_MAC)) + continue; + + bcopy(map->ma_addr, flow_desc.fd_dst_mac, map->ma_len); + mac_flow_set_desc(flent, &flow_desc); + } +} + +/* + * Update all clients that share the same unicast address. + */ +void +mac_unicast_update_clients(mac_impl_t *mip, mac_address_t *map) +{ + mac_client_impl_t *mcip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* + * Find all clients that share the same unicast MAC address and update + * them appropriately. + */ + for (mcip = mip->mi_clients_list; mcip != NULL; + mcip = mcip->mci_client_next) { + /* + * Ignore clients that don't share this MAC address. + */ + if (map != mcip->mci_unicast) + continue; + + /* + * Update those clients with the same old unicast MAC address.
+ */ + mac_unicast_update_client_flow(mcip); + } +} + +/* + * Update the unicast MAC address of the specified VNIC MAC client. + * + * Check whether the operation is valid. Any of the following cases should fail: + * + * 1. It's a VLAN type of VNIC. + * 2. The new value is the current "primary" MAC address. + * 3. The current MAC address is shared with other clients. + * 4. The new MAC address has been used. This case will become valid when + * client migration is fully supported. + */ +int +mac_vnic_unicast_set(mac_client_handle_t mch, const uint8_t *addr) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_address_t *map = mcip->mci_unicast; + int err; + + ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC)); + ASSERT(mcip->mci_state_flags & MCIS_IS_VNIC); + ASSERT(mcip->mci_flags != MAC_CLIENT_FLAGS_PRIMARY); + + i_mac_perim_enter(mip); + + /* + * If this is a VLAN type of VNIC, it's using the "primary" MAC address + * of the underlying interface. Must fail here. Refer to case 1 above. + */ + if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0) { + i_mac_perim_exit(mip); + return (ENOTSUP); + } + + /* + * If the new address is the "primary" one, must fail. Refer to + * case 2 above. + */ + if (bcmp(addr, mip->mi_addr, map->ma_len) == 0) { + i_mac_perim_exit(mip); + return (EACCES); + } + + /* + * If the address is shared by multiple clients, must fail. Refer + * to case 3 above. + */ + if (mac_check_macaddr_shared(map)) { + i_mac_perim_exit(mip); + return (EBUSY); + } + + /* + * If the new address has been used, must fail for now. Refer to + * case 4 above. + */ + if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) { + i_mac_perim_exit(mip); + return (ENOTSUP); + } + + /* + * Update the MAC address. + */ + err = mac_update_macaddr(map, (uint8_t *)addr); + + if (err != 0) { + i_mac_perim_exit(mip); + return (err); + } + + /* + * Update all flows of this MAC client. + */ + mac_unicast_update_client_flow(mcip); + + i_mac_perim_exit(mip); + return (0); +} + +/* + * Program the new primary unicast address of the specified MAC. + * + * Function mac_update_macaddr() takes care of different types of underlying + * MAC. If the underlying MAC is a VNIC, the VNIC driver must have registered + * the mi_unicst() entry point, which indirectly calls mac_vnic_unicast_set(), + * which will take care of updating the MAC address of the corresponding + * MAC client. + * + * This is the only interface that allows the client to update the "primary" + * MAC address of the underlying MAC. The new value must not have been + * used by other clients. + */ +int +mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_address_t *map; + int err; + + /* verify the address validity */ + if (!mac_unicst_verify(mh, addr, mip->mi_type->mt_addr_length)) + return (EINVAL); + + i_mac_perim_enter(mip); + + /* + * If the new value is the same as the current primary address value, + * there's nothing to do. + */ + if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) { + i_mac_perim_exit(mip); + return (0); + } + + if (mac_find_macaddr(mip, (uint8_t *)addr) != 0) { + i_mac_perim_exit(mip); + return (EBUSY); + } + + map = mac_find_macaddr(mip, mip->mi_addr); + ASSERT(map != NULL); + + /* + * Update the MAC address.
+ */ + if (mip->mi_state_flags & MIS_IS_AGGR) { + mac_capab_aggr_t aggr_cap; + + /* + * If the mac is an aggregation, other than the unicast + * addresses programming, aggr must be informed about this + * primary unicst address change to change its mac address + * policy to be user-specified. + */ + ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED); + VERIFY(i_mac_capab_get(mh, MAC_CAPAB_AGGR, &aggr_cap)); + err = aggr_cap.mca_unicst(mip->mi_driver, addr); + if (err == 0) + bcopy(addr, map->ma_addr, map->ma_len); + } else { + err = mac_update_macaddr(map, (uint8_t *)addr); + } + + if (err != 0) { + i_mac_perim_exit(mip); + return (err); + } + + mac_unicast_update_clients(mip, map); + + /* + * Save the new primary MAC address in mac_impl_t. + */ + bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length); + + i_mac_perim_exit(mip); + + if (err == 0) + i_mac_notify(mip, MAC_NOTE_UNICST); + + return (err); +} + +/* + * Return the current primary MAC address of the specified MAC. + */ +void +mac_unicast_primary_get(mac_handle_t mh, uint8_t *addr) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + rw_enter(&mip->mi_rw_lock, RW_READER); + bcopy(mip->mi_addr, addr, mip->mi_type->mt_addr_length); + rw_exit(&mip->mi_rw_lock); +} + +/* + * Return information about the use of the primary MAC address of the + * specified MAC instance: + * + * - if client_name is non-NULL, it must point to a string of at + * least MAXNAMELEN bytes, and will be set to the name of the MAC + * client which uses the primary MAC address. + * + * - if in_use is non-NULL, used to return whether the primary MAC + * address is currently in use. + */ +void +mac_unicast_primary_info(mac_handle_t mh, char *client_name, boolean_t *in_use) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *cur_client; + + if (in_use != NULL) + *in_use = B_FALSE; + if (client_name != NULL) + bzero(client_name, MAXNAMELEN); + + /* + * The mi_rw_lock is used to protect threads that don't hold the + * mac perimeter to get a consistent view of the mi_clients_list. + * Threads that modify the list must hold both the mac perimeter and + * mi_rw_lock(RW_WRITER) + */ + rw_enter(&mip->mi_rw_lock, RW_READER); + for (cur_client = mip->mi_clients_list; cur_client != NULL; + cur_client = cur_client->mci_client_next) { + if (mac_is_primary_client(cur_client) || + (mip->mi_state_flags & MIS_IS_VNIC)) { + rw_exit(&mip->mi_rw_lock); + if (in_use != NULL) + *in_use = B_TRUE; + if (client_name != NULL) { + bcopy(cur_client->mci_name, client_name, + MAXNAMELEN); + } + return; + } + } + rw_exit(&mip->mi_rw_lock); +} + +/* + * Add the specified MAC client to the list of clients which opened + * the specified MAC. + */ +static void +mac_client_add(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* add VNIC to the front of the list */ + rw_enter(&mip->mi_rw_lock, RW_WRITER); + mcip->mci_client_next = mip->mi_clients_list; + mip->mi_clients_list = mcip; + mip->mi_nclients++; + rw_exit(&mip->mi_rw_lock); +} + +/* + * Remove the specified MAC client from the list of clients which opened + * the specified MAC. 
+ */ +static void +mac_client_remove(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + mac_client_impl_t **prev, *cclient; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + rw_enter(&mip->mi_rw_lock, RW_WRITER); + prev = &mip->mi_clients_list; + cclient = *prev; + while (cclient != NULL && cclient != mcip) { + prev = &cclient->mci_client_next; + cclient = *prev; + } + ASSERT(cclient != NULL); + *prev = cclient->mci_client_next; + mip->mi_nclients--; + rw_exit(&mip->mi_rw_lock); +} + +static mac_unicast_impl_t * +mac_client_find_vid(mac_client_impl_t *mcip, uint16_t vid) +{ + mac_unicast_impl_t *muip = mcip->mci_unicast_list; + + while ((muip != NULL) && (muip->mui_vid != vid)) + muip = muip->mui_next; + + return (muip); +} + +/* + * Return whether the specified (MAC address, VID) tuple is already used by + * one of the MAC clients associated with the specified MAC. + */ +static boolean_t +mac_addr_in_use(mac_impl_t *mip, uint8_t *mac_addr, uint16_t vid) +{ + mac_client_impl_t *client; + mac_address_t *map; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + for (client = mip->mi_clients_list; client != NULL; + client = client->mci_client_next) { + + /* + * Ignore clients that don't have unicast address. + */ + if (client->mci_unicast_list == NULL) + continue; + + map = client->mci_unicast; + + if ((bcmp(mac_addr, map->ma_addr, map->ma_len) == 0) && + (mac_client_find_vid(client, vid) != NULL)) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * Generate a random MAC address. The MAC address prefix is + * stored in the array pointed to by mac_addr, and its length, in bytes, + * is specified by prefix_len. The least significant bits + * after prefix_len bytes are generated, and stored after the prefix + * in the mac_addr array. + */ +int +mac_addr_random(mac_client_handle_t mch, uint_t prefix_len, + uint8_t *mac_addr, mac_diag_t *diag) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + size_t addr_len = mip->mi_type->mt_addr_length; + + if (prefix_len >= addr_len) { + *diag = MAC_DIAG_MACPREFIXLEN_INVALID; + return (EINVAL); + } + + /* check the prefix value */ + if (prefix_len > 0) { + bzero(mac_addr + prefix_len, addr_len - prefix_len); + if (!mac_unicst_verify((mac_handle_t)mip, mac_addr, + addr_len)) { + *diag = MAC_DIAG_MACPREFIX_INVALID; + return (EINVAL); + } + } + + /* generate the MAC address */ + if (prefix_len < addr_len) { + (void) random_get_pseudo_bytes(mac_addr + + prefix_len, addr_len - prefix_len); + } + + *diag = 0; + return (0); +} + +/* + * Set the priority range for this MAC client. This will be used to + * determine the absolute priority for the threads created for this + * MAC client using the specified "low", "medium" and "high" level. + * This will also be used for any subflows on this MAC client. + */ +#define MAC_CLIENT_SET_PRIORITY_RANGE(mcip, pri) { \ + (mcip)->mci_min_pri = FLOW_MIN_PRIORITY(MINCLSYSPRI, \ + MAXCLSYSPRI, (pri)); \ + (mcip)->mci_max_pri = FLOW_MAX_PRIORITY(MINCLSYSPRI, \ + MAXCLSYSPRI, (mcip)->mci_min_pri); \ + } + +/* + * MAC client open entry point. Return a new MAC client handle. Each + * MAC client is associated with a name, specified through the 'name' + * argument. 
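 *
 * Minimal usage sketch (editor's example; "myclient" is a hypothetical
 * client name):
 *
 *	mac_client_handle_t mch;
 *	int err;
 *
 *	if ((err = mac_client_open(mh, &mch, "myclient", 0)) == 0) {
 *		... add a unicast address, set an rx callback, etc ...
 *		mac_client_close(mch, 0);
 *	}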
+ */ +int +mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, + uint16_t flags) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; + int err = 0; + boolean_t share_desired = + ((flags & MAC_OPEN_FLAGS_SHARES_DESIRED) != 0); + boolean_t no_hwrings = ((flags & MAC_OPEN_FLAGS_NO_HWRINGS) != 0); + boolean_t req_hwrings = ((flags & MAC_OPEN_FLAGS_REQ_HWRINGS) != 0); + flow_entry_t *flent = NULL; + + *mchp = NULL; + if (share_desired && no_hwrings) { + /* can't have shares but no hardware rings */ + return (EINVAL); + } + + i_mac_perim_enter(mip); + + if (mip->mi_state_flags & MIS_IS_VNIC) { + /* + * The underlying MAC is a VNIC. Return the MAC client + * handle of the lower MAC which was obtained by + * the VNIC driver when it did its mac_client_open(). + */ + + mcip = mac_vnic_lower(mip); + /* + * If there are multiple MAC clients of the VNIC, they + * all share the same underlying MAC client handle. + */ + if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_TAG_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_STRIP_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0) + mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK; + + /* + * Note that multiple mac clients share the same mcip in + * this case. + */ + if (flags & MAC_OPEN_FLAGS_EXCLUSIVE) + mcip->mci_state_flags |= MCIS_EXCLUSIVE; + + mip->mi_clients_list = mcip; + i_mac_perim_exit(mip); + *mchp = (mac_client_handle_t)mcip; + return (err); + } + + mcip = kmem_cache_alloc(mac_client_impl_cache, KM_SLEEP); + + mcip->mci_mip = mip; + mcip->mci_upper_mip = NULL; + mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_arg = NULL; + mcip->mci_direct_rx_fn = NULL; + mcip->mci_direct_rx_arg = NULL; + + if ((flags & MAC_OPEN_FLAGS_IS_VNIC) != 0) + mcip->mci_state_flags |= MCIS_IS_VNIC; + + if ((flags & MAC_OPEN_FLAGS_EXCLUSIVE) != 0) + mcip->mci_state_flags |= MCIS_EXCLUSIVE; + + if ((flags & MAC_OPEN_FLAGS_IS_AGGR_PORT) != 0) + mcip->mci_state_flags |= MCIS_IS_AGGR_PORT; + + if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_TAG_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_STRIP_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0) + mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK; + + if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) { + datalink_id_t linkid; + + ASSERT(name == NULL); + if ((err = dls_devnet_macname2linkid(mip->mi_name, + &linkid)) != 0) { + goto done; + } + if ((err = dls_mgmt_get_linkinfo(linkid, mcip->mci_name, NULL, + NULL, NULL)) != 0) { + /* + * Use mac name if dlmgmtd is not available. 
+ */ + if (err == EBADF) { + (void) strlcpy(mcip->mci_name, mip->mi_name, + sizeof (mcip->mci_name)); + err = 0; + } else { + goto done; + } + } + mcip->mci_state_flags |= MCIS_USE_DATALINK_NAME; + } else { + ASSERT(name != NULL); + if (strlen(name) > MAXNAMELEN) { + err = EINVAL; + goto done; + } + (void) strlcpy(mcip->mci_name, name, sizeof (mcip->mci_name)); + } + /* the subflow table will be created dynamically */ + mcip->mci_subflow_tab = NULL; + mcip->mci_stat_multircv = 0; + mcip->mci_stat_brdcstrcv = 0; + mcip->mci_stat_multixmt = 0; + mcip->mci_stat_brdcstxmt = 0; + + mcip->mci_stat_obytes = 0; + mcip->mci_stat_opackets = 0; + mcip->mci_stat_oerrors = 0; + mcip->mci_stat_ibytes = 0; + mcip->mci_stat_ipackets = 0; + mcip->mci_stat_ierrors = 0; + + /* Create an initial flow */ + + err = mac_flow_create(NULL, NULL, mcip->mci_name, NULL, + mcip->mci_state_flags & MCIS_IS_VNIC ? FLOW_VNIC_MAC : + FLOW_PRIMARY_MAC, &flent); + if (err != 0) + goto done; + mcip->mci_flent = flent; + FLOW_MARK(flent, FE_MC_NO_DATAPATH); + flent->fe_mcip = mcip; + /* + * Place initial creation reference on the flow. This reference + * is released in the corresponding delete action viz. + * mac_unicast_remove after waiting for all transient refs to + * to go away. The wait happens in mac_flow_wait. + */ + FLOW_REFHOLD(flent); + + /* + * Do this ahead of the mac_bcast_add() below so that the mi_nclients + * will have the right value for mac_rx_srs_setup(). + */ + mac_client_add(mcip); + + mcip->mci_no_hwrings = no_hwrings; + mcip->mci_req_hwrings = req_hwrings; + mcip->mci_share = NULL; + if (share_desired) { + ASSERT(!no_hwrings); + i_mac_share_alloc(mcip); + } + + DTRACE_PROBE2(mac__client__open__allocated, mac_impl_t *, + mcip->mci_mip, mac_client_impl_t *, mcip); + *mchp = (mac_client_handle_t)mcip; + + i_mac_perim_exit(mip); + return (0); + +done: + i_mac_perim_exit(mip); + mcip->mci_state_flags = 0; + mcip->mci_tx_flag = 0; + kmem_cache_free(mac_client_impl_cache, mcip); + return (err); +} + +/* + * Close the specified MAC client handle. + */ +void +mac_client_close(mac_client_handle_t mch, uint16_t flags) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent; + + i_mac_perim_enter(mip); + + if (flags & MAC_CLOSE_FLAGS_EXCLUSIVE) + mcip->mci_state_flags &= ~MCIS_EXCLUSIVE; + + if ((mcip->mci_state_flags & MCIS_IS_VNIC) && + !(flags & MAC_CLOSE_FLAGS_IS_VNIC)) { + /* + * This is an upper VNIC client initiated operation. + * The lower MAC client will be closed by the VNIC driver + * when the VNIC is deleted. + */ + + /* + * Clear the flags set when the upper client initiated + * open. + */ + mcip->mci_state_flags &= ~(MCIS_TAG_DISABLE | + MCIS_STRIP_DISABLE | MCIS_DISABLE_TX_VID_CHECK); + + i_mac_perim_exit(mip); + return; + } + + /* + * Remove the flent associated with the MAC client + */ + flent = mcip->mci_flent; + mcip->mci_flent = NULL; + FLOW_FINAL_REFRELE(flent); + + /* + * MAC clients must remove the unicast addresses and promisc callbacks + * they added before issuing a mac_client_close(). + */ + ASSERT(mcip->mci_unicast_list == NULL); + ASSERT(mcip->mci_promisc_list == NULL); + ASSERT(mcip->mci_tx_notify_cb_list == NULL); + + i_mac_share_free(mcip); + + mac_client_remove(mcip); + + i_mac_perim_exit(mip); + mcip->mci_subflow_tab = NULL; + mcip->mci_state_flags = 0; + mcip->mci_tx_flag = 0; + kmem_cache_free(mac_client_impl_cache, mch); +} + +/* + * Enable bypass for the specified MAC client. 
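 *
 * Illustrative sketch (editor's example; my_direct_rx and my_arg are
 * hypothetical): a client such as an IP squeue can request bypass and
 * fall back to its regular delivery path when bypass is refused:
 *
 *	if (!mac_rx_bypass_set(mch, my_direct_rx, my_arg))
 *		... keep receiving through the mac_rx_set() callback ...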
+ */ +boolean_t +mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* + * If the mac_client is a VLAN or native media is non ethernet, we + * should not do DLS bypass and instead let the packets go via the + * default mac_rx_deliver route so vlan header can be stripped etc. + */ + if (mcip->mci_nvids > 0 || + mip->mi_info.mi_nativemedia != DL_ETHER) + return (B_FALSE); + + /* + * These are not accessed directly in the data path, and hence + * don't need any protection + */ + mcip->mci_direct_rx_fn = rx_fn; + mcip->mci_direct_rx_arg = arg1; + mcip->mci_state_flags |= MCIS_CLIENT_POLL_CAPABLE; + return (B_TRUE); +} + +/* + * Set the receive callback for the specified MAC client. There can be + * at most one such callback per MAC client. + */ +void +mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + /* + * Instead of adding an extra set of locks and refcnts in + * the datapath at the mac client boundary, we temporarily quiesce + * the SRS and related entities. We then change the receive function + * without interference from any receive data thread and then reenable + * the data flow subsequently. + */ + i_mac_perim_enter(mip); + mac_rx_client_quiesce(mch); + + mcip->mci_rx_fn = rx_fn; + mcip->mci_rx_arg = arg; + mac_rx_client_restart(mch); + i_mac_perim_exit(mip); +} + +/* + * Reset the receive callback for the specified MAC client. + */ +void +mac_rx_clear(mac_client_handle_t mch) +{ + mac_rx_set(mch, mac_pkt_drop, NULL); +} + +/* + * Walk the MAC client subflow table and updates their priority values. + */ +static int +mac_update_subflow_priority_cb(flow_entry_t *flent, void *arg) +{ + mac_flow_update_priority(arg, flent); + return (0); +} + +void +mac_update_subflow_priority(mac_client_impl_t *mcip) +{ + (void) mac_flow_walk(mcip->mci_subflow_tab, + mac_update_subflow_priority_cb, mcip); +} + +/* + * When the MAC client is being brought up (i.e. we do a unicast_add) we need + * to initialize the cpu and resource control structure in the + * mac_client_impl_t from the mac_impl_t (i.e if there are any cached + * properties before the flow entry for the unicast address was created). + */ +int +mac_resource_ctl_set(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = (mac_impl_t *)mcip->mci_mip; + int err = 0; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + err = mac_validate_props(mrp); + if (err != 0) + return (err); + + mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip), B_FALSE); + if (MCIP_DATAPATH_SETUP(mcip)) { + /* + * We have to set this prior to calling mac_flow_modify. 
+ */ + if (mrp->mrp_mask & MRP_PRIORITY) { + if (mrp->mrp_priority == MPL_RESET) { + MAC_CLIENT_SET_PRIORITY_RANGE(mcip, + MPL_LINK_DEFAULT); + } else { + MAC_CLIENT_SET_PRIORITY_RANGE(mcip, + mrp->mrp_priority); + } + } + + mac_flow_modify(mip->mi_flow_tab, mcip->mci_flent, mrp); + if (mrp->mrp_mask & MRP_PRIORITY) + mac_update_subflow_priority(mcip); + return (0); + } + return (0); +} + +void +mac_resource_ctl_get(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + + bcopy(mcip_mrp, mrp, sizeof (mac_resource_props_t)); +} + +static int +mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr, + uint16_t vid, boolean_t is_primary, boolean_t first_flow, + flow_entry_t **flent, mac_resource_props_t *mrp) +{ + mac_impl_t *mip = (mac_impl_t *)mcip->mci_mip; + flow_desc_t flow_desc; + char flowname[MAXFLOWNAME]; + int err; + uint_t flent_flags; + + /* + * First unicast address being added, create a new flow + * for that MAC client. + */ + bzero(&flow_desc, sizeof (flow_desc)); + + flow_desc.fd_mac_len = mip->mi_type->mt_addr_length; + bcopy(mac_addr, flow_desc.fd_dst_mac, flow_desc.fd_mac_len); + flow_desc.fd_mask = FLOW_LINK_DST; + if (vid != 0) { + flow_desc.fd_vid = vid; + flow_desc.fd_mask |= FLOW_LINK_VID; + } + + /* + * XXX-nicolas. For now I'm keeping the FLOW_PRIMARY_MAC + * and FLOW_VNIC. Even though they're a hack inherited + * from the SRS code, we'll keep them for now. They're currently + * consumed by mac_datapath_setup() to create the SRS. + * That code should be eventually moved out of + * mac_datapath_setup() and moved to a mac_srs_create() + * function of some sort to keep things clean. + * + * Also, there's no reason why the SRS for the primary MAC + * client should be different than any other MAC client. Until + * this is cleaned-up, we support only one MAC unicast address + * per client. + * + * We set FLOW_PRIMARY_MAC for the primary MAC address, + * FLOW_VNIC for everything else. + */ + if (is_primary) + flent_flags = FLOW_PRIMARY_MAC; + else + flent_flags = FLOW_VNIC_MAC; + + /* + * For the first flow we use the mac client's name - mci_name, for + * subsequent ones we just create a name with the vid. This is + * so that we can add these flows to the same flow table. This is + * fine as the flow name (except for the one with the mac client's + * name) is not visible. When the first flow is removed, we just replace + * its fdesc with another from the list, so we will still retain the + * flent with the MAC client's flow name. + */ + if (first_flow) { + bcopy(mcip->mci_name, flowname, MAXFLOWNAME); + } else { + (void) sprintf(flowname, "%s%u", mcip->mci_name, vid); + flent_flags = FLOW_NO_STATS; + } + + if ((err = mac_flow_create(&flow_desc, mrp, flowname, NULL, + flent_flags, flent)) != 0) + return (err); + + FLOW_MARK(*flent, FE_INCIPIENT); + (*flent)->fe_mcip = mcip; + + /* + * Place initial creation reference on the flow. This reference + * is released in the corresponding delete action viz. + * mac_unicast_remove after waiting for all transient refs to + * to go away. The wait happens in mac_flow_wait. + * We have already held the reference in mac_client_open(). + */ + if (!first_flow) + FLOW_REFHOLD(*flent); + return (0); +} + +/* Refresh the multicast grouping for this VID. 
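 *
 * For illustration (editor's note): this function is passed as a
 * callback to mac_client_bcast_refresh(), as in the call sites that
 * appear later in this file:
 *
 *	mac_client_bcast_refresh(mcip, mac_client_update_mcast,
 *	    (void *)flent, B_TRUE);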
*/ +int +mac_client_update_mcast(void *arg, boolean_t add, const uint8_t *addrp) +{ + flow_entry_t *flent = arg; + mac_client_impl_t *mcip = flent->fe_mcip; + uint16_t vid; + flow_desc_t flow_desc; + + mac_flow_get_desc(flent, &flow_desc); + vid = (flow_desc.fd_mask & FLOW_LINK_VID) != 0 ? + flow_desc.fd_vid : VLAN_ID_NONE; + + /* + * We don't call mac_multicast_add()/mac_multicast_remove() as + * we want to add/remove for this specific vid. + */ + if (add) { + return (mac_bcast_add(mcip, addrp, vid, + MAC_ADDRTYPE_MULTICAST)); + } else { + mac_bcast_delete(mcip, addrp, vid); + return (0); + } +} + +/* + * Add a new unicast address to the MAC client. + * + * The MAC address can be specified either by value, or the MAC client + * can specify that it wants to use the primary MAC address of the + * underlying MAC. See the introductory comments at the beginning + * of this file for more information on primary MAC addresses. + * + * Note also that the tuple (MAC address, VID) must be unique + * for the MAC clients defined on top of the same underlying MAC + * instance, unless MAC_UNICAST_NODUPCHECK is specified. + */ + +int +i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, + mac_unicast_handle_t *mah, uint16_t vid, mac_diag_t *diag) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_unicast_impl_t *muip; + flow_entry_t *flent; + int err; + uint_t mac_len = mip->mi_type->mt_addr_length; + boolean_t check_dups = !(flags & MAC_UNICAST_NODUPCHECK); + boolean_t is_primary = (flags & MAC_UNICAST_PRIMARY); + boolean_t is_vnic_primary = flags & MAC_UNICAST_VNIC_PRIMARY; + boolean_t bcast_added = B_FALSE; + boolean_t nactiveclients_added = B_FALSE; + boolean_t mac_started = B_FALSE; + mac_resource_props_t mrp; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* when VID is non-zero, the underlying MAC cannot be a VNIC */ + ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0))); + + /* + * Check whether it's the primary client and flag it. + */ + if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0) + mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY; + + /* + * is_vnic_primary is true when we come here as a VLAN VNIC + * which uses the primary mac client's address but with a non-zero + * VID. In this case the MAC address is not specified by an upper + * MAC client. + */ + if ((mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && + !is_vnic_primary) { + /* + * The address is being set by the upper MAC client + * of a VNIC. The MAC address was already set by the + * VNIC driver during VNIC creation. + * + * Note: a VNIC has only one MAC address. We return + * the MAC unicast address handle of the lower MAC client + * corresponding to the VNIC. We allocate a new entry + * which is flagged appropriately, so that mac_unicast_remove() + * doesn't attempt to free the original entry that + * was allocated by the VNIC driver. + */ + ASSERT(mcip->mci_unicast != NULL); + + /* + * Ensure that the primary unicast address of the VNIC + * is added only once. + */ + if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) + return (EBUSY); + + mcip->mci_flags |= MAC_CLIENT_FLAGS_VNIC_PRIMARY; + + /* + * Create a handle for vid 0.
+ */ + ASSERT(vid == 0); + muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); + muip->mui_vid = vid; + *mah = (mac_unicast_handle_t)muip; + return (0); + } + + /* primary MAC clients cannot be opened on top of anchor VNICs */ + if ((is_vnic_primary || is_primary) && + i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_ANCHOR_VNIC, NULL)) { + return (ENXIO); + } + + /* + * Return EBUSY if: + * - this is an exclusive active mac client and there already exist + * active mac clients, or + * - there already exists an exclusively active mac client. + */ + if ((mcip->mci_state_flags & MCIS_EXCLUSIVE) && + (mip->mi_nactiveclients != 0) || (mip->mi_state_flags & + MIS_EXCLUSIVE)) { + return (EBUSY); + } + + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) + mip->mi_state_flags |= MIS_EXCLUSIVE; + + bzero(&mrp, sizeof (mac_resource_props_t)); + if (is_primary && !(mcip->mci_state_flags & MCIS_IS_VNIC)) { + /* + * Apply the property cached in the mac_impl_t to the primary + * mac client. If the mac client is a VNIC, its properties were + * already set in the mcip when the VNIC was created. + */ + mac_get_resources((mac_handle_t)mip, &mrp); + (void) mac_client_set_resources(mch, &mrp); + } else if (mcip->mci_state_flags & MCIS_IS_VNIC) { + bcopy(MCIP_RESOURCE_PROPS(mcip), &mrp, + sizeof (mac_resource_props_t)); + } + + muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); + muip->mui_vid = vid; + + if (is_primary || is_vnic_primary) { + mac_addr = mip->mi_addr; + check_dups = B_TRUE; + } else { + + /* + * Verify the validity of the specified MAC address value. + */ + if (!mac_unicst_verify((mac_handle_t)mip, mac_addr, mac_len)) { + *diag = MAC_DIAG_MACADDR_INVALID; + err = EINVAL; + goto bail; + } + + /* + * Make sure that the specified MAC address is different + * from the unicast MAC address of the underlying NIC. + */ + if (check_dups && bcmp(mip->mi_addr, mac_addr, mac_len) == 0) { + *diag = MAC_DIAG_MACADDR_NIC; + err = EINVAL; + goto bail; + } + } + + /* + * Make sure the MAC address is not already used by + * another MAC client defined on top of the same + * underlying NIC. + * xxx-venu mac_unicast_add doesn't seem to be called + * with MAC_UNICAST_NODUPCHECK currently, if it does + * get called we need to do mac_addr_in_use() just + * to check for addr_in_use till 6697876 is fixed. + */ + if (check_dups && mac_addr_in_use(mip, mac_addr, vid)) { + *diag = MAC_DIAG_MACADDR_INUSE; + err = EEXIST; + goto bail; + } + + if ((err = mac_start(mip)) != 0) + goto bail; + + mac_started = B_TRUE; + + /* add the MAC client to the broadcast address group by default */ + if (mip->mi_type->mt_brdcst_addr != NULL) { + err = mac_bcast_add(mcip, mip->mi_type->mt_brdcst_addr, vid, + MAC_ADDRTYPE_BROADCAST); + if (err != 0) + goto bail; + bcast_added = B_TRUE; + } + flent = mcip->mci_flent; + ASSERT(flent != NULL); + /* We are configuring the unicast flow now */ + if (!MCIP_DATAPATH_SETUP(mcip)) { + + MAC_CLIENT_SET_PRIORITY_RANGE(mcip, + (mrp.mrp_mask & MRP_PRIORITY) ? mrp.mrp_priority : + MPL_LINK_DEFAULT); + + if ((err = mac_unicast_flow_create(mcip, mac_addr, vid, + is_primary || is_vnic_primary, B_TRUE, &flent, &mrp)) != 0) + goto bail; + + mip->mi_nactiveclients++; + nactiveclients_added = B_TRUE; + /* + * This will allocate the RX ring group if possible for the + * flow and program the software classifier as needed. + */ + if ((err = mac_datapath_setup(mcip, flent, SRST_LINK)) != 0) + goto bail; + + /* + * The unicast MAC address must have been added successfully.
+ */ + ASSERT(mcip->mci_unicast != NULL); + } else { + mac_address_t *map = mcip->mci_unicast; + + /* + * A unicast flow already exists for that MAC client, + * this flow must have the same MAC address but a + * different VID. It has been checked by mac_addr_in_use(). + * + * We will use the SRS etc. from the mci_flent. Note that + * we don't need to create a kstat for this as, except for + * the fdesc, everything will be used from the 1st flent. + */ + + if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) { + err = EINVAL; + goto bail; + } + + if ((err = mac_unicast_flow_create(mcip, mac_addr, vid, + is_primary || is_vnic_primary, B_FALSE, &flent, NULL)) != 0) + goto bail; + + if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) { + FLOW_FINAL_REFRELE(flent); + goto bail; + } + + /* update the multicast group for this vid */ + mac_client_bcast_refresh(mcip, mac_client_update_mcast, + (void *)flent, B_TRUE); + + } + + /* populate the shared MAC address */ + muip->mui_map = mcip->mci_unicast; + + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + muip->mui_next = mcip->mci_unicast_list; + mcip->mci_unicast_list = muip; + rw_exit(&mcip->mci_rw_lock); + + *mah = (mac_unicast_handle_t)muip; + + /* add it to the flow list of this mcip */ + mac_client_add_to_flow_list(mcip, flent); + + /* + * Trigger a renegotiation of the capabilities when the number of + * active clients changes from 1 to 2, since some of the capabilities + * might have to be disabled. Also send a MAC_NOTE_LINK notification + * to all the MAC clients whenever the physical link is DOWN. + */ + if (mip->mi_nactiveclients == 2) { + mac_capab_update((mac_handle_t)mip); + mac_virtual_link_update(mip); + } + /* + * Now that the setup is complete, clear the INCIPIENT flag. + * The flag was set to avoid incoming packets seeing inconsistent + * structures while the setup was in progress. Clear the mci_tx_flag + * by calling mac_tx_client_unblock. It is possible that + * mac_unicast_remove was called prior to this mac_unicast_add which + * could have set the MCI_TX_QUIESCE flag. + */ + if (flent->fe_rx_ring_group != NULL) + mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT); + FLOW_UNMARK(flent, FE_INCIPIENT); + FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + mac_tx_client_unblock(mcip); + return (0); +bail: + if (bcast_added) + mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, vid); + if (mac_started) + mac_stop(mip); + + if (nactiveclients_added) + mip->mi_nactiveclients--; + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) + mip->mi_state_flags &= ~MIS_EXCLUSIVE; + kmem_free(muip, sizeof (mac_unicast_impl_t)); + return (err); +} + +int +mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, + mac_unicast_handle_t *mah, uint16_t vid, mac_diag_t *diag) +{ + mac_impl_t *mip = ((mac_client_impl_t *)mch)->mci_mip; + uint_t err; + + i_mac_perim_enter(mip); + err = i_mac_unicast_add(mch, mac_addr, flags, mah, vid, diag); + i_mac_perim_exit(mip); + + return (err); +} + +/* + * Add the primary MAC address to the MAC client. This is a convenience + * function which can be called by primary MAC clients which do not + * need to specify any other additional flags.
+ * + * It's called in one of the following situations: + * * dls as the primary MAC client + * * aggr as an exclusive client + * * by VNIC's client + */ +int +mac_unicast_primary_add(mac_client_handle_t mch, mac_unicast_handle_t *mah, + mac_diag_t *diag) +{ + return (mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY, mah, 0, diag)); +} + +/* + * Remove a MAC address which was previously added by mac_unicast_add(). + */ +int +mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_unicast_impl_t *muip = (mac_unicast_impl_t *)mah; + mac_unicast_impl_t *pre; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent; + + i_mac_perim_enter(mip); + if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) { + /* + * Call made by the upper MAC client of a VNIC. + * There's not much to do: the unicast address will + * be removed by the VNIC driver when the VNIC is deleted, + * but let's ensure that all our transmit is done before + * the client does a mac_client_stop lest it trigger an + * assert in the driver. + */ + ASSERT(muip->mui_vid == 0); + + mac_tx_client_flush(mcip); + mcip->mci_flags &= ~MAC_CLIENT_FLAGS_VNIC_PRIMARY; + + kmem_free(muip, sizeof (mac_unicast_impl_t)); + i_mac_perim_exit(mip); + return (0); + } + + ASSERT(muip != NULL); + + /* + * Remove the VID from the list of client's VIDs. + */ + pre = mcip->mci_unicast_list; + if (muip == pre) + mcip->mci_unicast_list = muip->mui_next; + else { + while ((pre->mui_next != NULL) && (pre->mui_next != muip)) + pre = pre->mui_next; + ASSERT(pre->mui_next == muip); + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + pre->mui_next = muip->mui_next; + rw_exit(&mcip->mci_rw_lock); + } + + if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && muip->mui_vid == 0) + mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY; + + /* + * This MAC client is shared, so we will just remove the flent + * corresponding to the address being removed. We don't invoke + * mac_rx_classify_flow_rem() since the additional flow is + * not associated with its own separate set of SRS and rings, + * and these constructs are still needed for the remaining flows. + */ + if (!mac_client_single_rcvr(mcip)) { + flent = mac_client_get_flow(mcip, muip); + ASSERT(flent != NULL); + + /* + * The first one is disappearing, need to make sure + * we replace it with another from the list of + * shared clients. + */ + if (flent == mcip->mci_flent) + flent = mac_client_swap_mciflent(mcip); + mac_client_remove_flow_from_list(mcip, flent); + mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); + mac_flow_wait(flent, FLOW_DRIVER_UPCALL); + + /* + * The multicast groups that were added by the client so + * far must be removed from the broadcast domain corresponding + * to the VID being removed. + */ + mac_client_bcast_refresh(mcip, mac_client_update_mcast, + (void *)flent, B_FALSE); + + if (mip->mi_type->mt_brdcst_addr != NULL) { + mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, + muip->mui_vid); + } + mac_stop(mip); + FLOW_FINAL_REFRELE(flent); + i_mac_perim_exit(mip); + return (0); + } + + mip->mi_nactiveclients--; + + /* Tear down the Data path */ + mac_datapath_teardown(mcip, mcip->mci_flent, SRST_LINK); + + /* + * Prevent any future access to the flow entry through the mci_flent + * pointer by setting the mci_flent to NULL. Access to mci_flent in + * mac_bcast_send is also under mi_rw_lock.
+ */ + rw_enter(&mip->mi_rw_lock, RW_WRITER); + flent = mcip->mci_flent; + mac_client_remove_flow_from_list(mcip, flent); + + if (mcip->mci_state_flags & MCIS_DESC_LOGGED) + mcip->mci_state_flags &= ~MCIS_DESC_LOGGED; + + /* + * This is the last unicast address being removed and there shouldn't + * be any outbound data threads at this point coming down from mac + * clients. We have waited for the data threads to finish before + * starting dld_str_detach. Non-data threads must access TX SRS + * under mi_rw_lock. + */ + rw_exit(&mip->mi_rw_lock); + + /* + * Update the multicast group for this vid. + */ + mac_client_bcast_refresh(mcip, mac_client_update_mcast, (void *)flent, + B_FALSE); + + /* + * Don't use FLOW_MARK with FE_MC_NO_DATAPATH, as the flow might + * contain other flags, such as FE_CONDEMNED, which we need to + * clear. We don't call mac_flow_cleanup() for this unicast + * flow as we have already cleaned up the SRSs etc. (via the teardown + * path). We just clear the stats and reset the initial callback + * function, the rest will be set when we call mac_flow_create, + * if at all. + */ + mutex_enter(&flent->fe_lock); + ASSERT(flent->fe_refcnt == 1 && flent->fe_mbg == NULL && + flent->fe_tx_srs == NULL && flent->fe_rx_srs_cnt == 0); + flent->fe_flags = FE_MC_NO_DATAPATH; + flow_stat_destroy(flent); + + /* Initialize the receiver function to a safe routine */ + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_arg1 = NULL; + flent->fe_cb_arg2 = NULL; + + flent->fe_index = -1; + mutex_exit(&flent->fe_lock); + + if (mip->mi_type->mt_brdcst_addr != NULL) { + mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, + muip->mui_vid); + } + + if (mip->mi_nactiveclients == 1) { + mac_capab_update((mac_handle_t)mip); + mac_virtual_link_update(mip); + } + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) + mip->mi_state_flags &= ~MIS_EXCLUSIVE; + + mac_stop(mip); + + i_mac_perim_exit(mip); + kmem_free(muip, sizeof (mac_unicast_impl_t)); + return (0); +} + +/* + * Multicast add function invoked by MAC clients. + */ +int +mac_multicast_add(mac_client_handle_t mch, const uint8_t *addr) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent = mcip->mci_flent_list; + flow_entry_t *prev_fe = NULL; + uint16_t vid; + int err = 0; + + /* Verify the address is a valid multicast address */ + if ((err = mip->mi_type->mt_ops.mtops_multicst_verify(addr, + mip->mi_pdata)) != 0) + return (err); + + i_mac_perim_enter(mip); + while (flent != NULL) { + vid = i_mac_flow_vid(flent); + + err = mac_bcast_add((mac_client_impl_t *)mch, addr, vid, + MAC_ADDRTYPE_MULTICAST); + if (err != 0) + break; + prev_fe = flent; + flent = flent->fe_client_next; + } + + /* + * If adding failed, undo everything rather than leave a + * partial success. + */ + if (flent != NULL && prev_fe != NULL) { + flent = mcip->mci_flent_list; + while (flent != prev_fe->fe_client_next) { + vid = i_mac_flow_vid(flent); + mac_bcast_delete((mac_client_impl_t *)mch, addr, vid); + flent = flent->fe_client_next; + } + } + i_mac_perim_exit(mip); + return (err); +} + +/* + * Multicast delete function invoked by MAC clients.
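 *
 * Usage sketch (editor's example; addr points to a multicast address
 * valid for the media type):
 *
 *	if (mac_multicast_add(mch, addr) == 0) {
 *		... receive traffic for that group ...
 *		mac_multicast_remove(mch, addr);
 *	}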
+ */ +void +mac_multicast_remove(mac_client_handle_t mch, const uint8_t *addr) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent; + uint16_t vid; + + i_mac_perim_enter(mip); + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + vid = i_mac_flow_vid(flent); + mac_bcast_delete((mac_client_impl_t *)mch, addr, vid); + } + i_mac_perim_exit(mip); +} + +/* + * When a MAC client desires to capture packets on an interface, + * it registers a promiscuous callback with mac_promisc_add(). + * There are three types of promiscuous callbacks: + * + * * MAC_CLIENT_PROMISC_ALL + * Captures all packets sent and received by the MAC client, + * the physical interface, as well as all other MAC clients + * defined on top of the same MAC. + * + * * MAC_CLIENT_PROMISC_FILTERED + * Captures all packets sent and received by the MAC client, + * plus all multicast traffic sent and received by the physical + * interface and the other MAC clients. + * + * * MAC_CLIENT_PROMISC_MULTI + * Captures all broadcast and multicast packets sent and + * received by the MAC clients as well as the physical interface. + * + * In all cases, the underlying MAC is put in promiscuous mode. + */ +int +mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, + mac_rx_t fn, void *arg, mac_promisc_handle_t *mphp, uint16_t flags) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_promisc_impl_t *mpip; + mac_cb_info_t *mcbi; + int rc; + + i_mac_perim_enter(mip); + + if ((rc = mac_start(mip)) != 0) { + i_mac_perim_exit(mip); + return (rc); + } + + if ((mcip->mci_state_flags & MCIS_IS_VNIC) && + type == MAC_CLIENT_PROMISC_ALL) { + /* + * The function is being invoked by the upper MAC client + * of a VNIC. The VNIC should only see the traffic + * it is entitled to. + */ + type = MAC_CLIENT_PROMISC_FILTERED; + } + + + /* + * Turn on promiscuous mode for the underlying NIC. + * This is needed even for filtered callbacks which + * expect to receive all multicast traffic on the wire. + * + * Physical promiscuous mode should not be turned on if + * MAC_PROMISC_FLAGS_NO_PHYS is set. + */ + if ((flags & MAC_PROMISC_FLAGS_NO_PHYS) == 0) { + if ((rc = i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC)) + != 0) { + mac_stop(mip); + i_mac_perim_exit(mip); + return (rc); + } + } + + mpip = kmem_cache_alloc(mac_promisc_impl_cache, KM_SLEEP); + + mpip->mpi_type = type; + mpip->mpi_fn = fn; + mpip->mpi_arg = arg; + mpip->mpi_mcip = mcip; + mpip->mpi_no_tx_loop = ((flags & MAC_PROMISC_FLAGS_NO_TX_LOOP) != 0); + mpip->mpi_no_phys = ((flags & MAC_PROMISC_FLAGS_NO_PHYS) != 0); + + mcbi = &mip->mi_promisc_cb_info; + mutex_enter(mcbi->mcbi_lockp); + + mac_callback_add(&mip->mi_promisc_cb_info, &mcip->mci_promisc_list, + &mpip->mpi_mci_link); + mac_callback_add(&mip->mi_promisc_cb_info, &mip->mi_promisc_list, + &mpip->mpi_mi_link); + + mutex_exit(mcbi->mcbi_lockp); + + *mphp = (mac_promisc_handle_t)mpip; + i_mac_perim_exit(mip); + return (0); +} + +/* + * Remove a promiscuous callback previously added through mac_promisc_add(). + */ +int +mac_promisc_remove(mac_promisc_handle_t mph) +{ + mac_promisc_impl_t *mpip = (mac_promisc_impl_t *)mph; + mac_client_impl_t *mcip = mpip->mpi_mcip; + mac_impl_t *mip = mcip->mci_mip; + mac_cb_info_t *mcbi; + int rc = 0; + + i_mac_perim_enter(mip); + + /* + * Even if the device can't be reset into normal mode, we still + * need to clear the client promisc callbacks.
The client may want + * to close the mac end point and we can't have stale callbacks. + */ + if (!(mpip->mpi_no_phys)) { + rc = mac_promisc_set((mac_handle_t)mip, B_FALSE, + MAC_DEVPROMISC); + if (rc != 0) + goto done; + } + mcbi = &mip->mi_promisc_cb_info; + mutex_enter(mcbi->mcbi_lockp); + if (mac_callback_remove(mcbi, &mip->mi_promisc_list, + &mpip->mpi_mi_link)) { + VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info, + &mcip->mci_promisc_list, &mpip->mpi_mci_link)); + kmem_cache_free(mac_promisc_impl_cache, mpip); + } else { + mac_callback_remove_wait(&mip->mi_promisc_cb_info); + } + mutex_exit(mcbi->mcbi_lockp); + mac_stop(mip); + +done: + i_mac_perim_exit(mip); + return (rc); +} + +/* + * Reference count the number of active Tx threads. MCI_TX_QUIESCE indicates + * that a control operation wants to quiesce the Tx data flow in which case + * we return an error. Holding any of the per cpu locks ensures that the + * mci_tx_flag won't change. + * + * 'CPU' must be accessed just once and used to compute the index into the + * percpu array, and that index must be used for the entire duration of the + * packet send operation. Note that the thread may be preempted and run on + * another cpu any time and so we can't use 'CPU' more than once for the + * operation. + */ +#define MAC_TX_TRY_HOLD(mcip, mytx, error) \ +{ \ + (error) = 0; \ + (mytx) = &(mcip)->mci_tx_pcpu[CPU->cpu_seqid & mac_tx_percpu_cnt]; \ + mutex_enter(&(mytx)->pcpu_tx_lock); \ + if (!((mcip)->mci_tx_flag & MCI_TX_QUIESCE)) { \ + (mytx)->pcpu_tx_refcnt++; \ + } else { \ + (error) = -1; \ + } \ + mutex_exit(&(mytx)->pcpu_tx_lock); \ +} + +/* + * Release the reference. If needed, signal any control operation waiting + * for Tx quiescence. The wait and signal are always done using the + * mci_tx_pcpu[0]'s lock + */ +#define MAC_TX_RELE(mcip, mytx) { \ + mutex_enter(&(mytx)->pcpu_tx_lock); \ + if (--(mytx)->pcpu_tx_refcnt == 0 && \ + (mcip)->mci_tx_flag & MCI_TX_QUIESCE) { \ + mutex_exit(&(mytx)->pcpu_tx_lock); \ + mutex_enter(&(mcip)->mci_tx_pcpu[0].pcpu_tx_lock); \ + cv_signal(&(mcip)->mci_tx_cv); \ + mutex_exit(&(mcip)->mci_tx_pcpu[0].pcpu_tx_lock); \ + } else { \ + mutex_exit(&(mytx)->pcpu_tx_lock); \ + } \ +} + +/* + * Bump the count of the number of active Tx threads. This is maintained as + * a per CPU counter. On (CMT kind of) machines with large number of CPUs, + * a single mci_tx_lock may become contended. However a count of the total + * number of Tx threads per client is needed in order to quiesce the Tx side + * prior to reassigning a Tx ring dynamically to another client. The thread + * that needs to quiesce the Tx traffic grabs all the percpu locks and checks + * the sum of the individual percpu refcnts. Each Tx data thread only grabs + * its own percpu lock and increments its own refcnt. + */ +void * +mac_tx_hold(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_tx_percpu_t *mytx; + int error; + + MAC_TX_TRY_HOLD(mcip, mytx, error); + return (error == 0 ? (void *)mytx : NULL); +} + +void +mac_tx_rele(mac_client_handle_t mch, void *mytx_handle) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_tx_percpu_t *mytx = mytx_handle; + + MAC_TX_RELE(mcip, mytx) +} + +/* + * Send function invoked by MAC clients. 
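 *
 * Minimal transmit sketch (editor's example; mp is a hypothetical
 * mblk_t chain owned by the caller):
 *
 *	mac_tx_cookie_t cookie;
 *	mblk_t *ret_mp = NULL;
 *
 *	cookie = mac_tx(mch, mp, 0, 0, &ret_mp);
 *
 * A non-NULL cookie suggests the chain could not be fully sent and may
 * be used with mac_tx_is_flow_blocked() below.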
+ */ +mac_tx_cookie_t +mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, + uint16_t flag, mblk_t **ret_mp) +{ + mac_tx_cookie_t cookie; + int error; + mac_tx_percpu_t *mytx; + mac_soft_ring_set_t *srs; + flow_entry_t *flent; + boolean_t is_subflow = B_FALSE; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_srs_tx_t *srs_tx; + + /* + * Check whether the active Tx threads count is bumped already. + */ + if (!(flag & MAC_TX_NO_HOLD)) { + MAC_TX_TRY_HOLD(mcip, mytx, error); + if (error != 0) { + freemsgchain(mp_chain); + return (NULL); + } + } + + if (mcip->mci_subflow_tab != NULL && + mcip->mci_subflow_tab->ft_flow_count > 0 && + mac_flow_lookup(mcip->mci_subflow_tab, mp_chain, + FLOW_OUTBOUND, &flent) == 0) { + /* + * The main assumption here is that in the event + * we get a chain, all the packets will be classified + * to the same Flow/SRS. If this changes for any + * reason, the following logic should change as well. + * I suppose the fanout_hint also assumes this. + */ + ASSERT(flent != NULL); + is_subflow = B_TRUE; + } else { + flent = mcip->mci_flent; + } + + srs = flent->fe_tx_srs; + srs_tx = &srs->srs_tx; + if (srs_tx->st_mode == SRS_TX_DEFAULT && + (srs->srs_state & SRS_ENQUEUED) == 0 && + mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL && + mp_chain->b_next == NULL) { + uint64_t obytes; + + /* + * Since dls always opens the underlying MAC, nclients equal + * to 1 means that the only active client is dls itself acting + * as a primary client of the MAC instance. Since dls will not + * send tagged packets in that case, and dls is trusted to send + * packets for its allowed VLAN(s), the VLAN tag insertion and + * check is required only if nclients is greater than 1. + */ + if (mip->mi_nclients > 1) { + if (MAC_VID_CHECK_NEEDED(mcip)) { + int err = 0; + + MAC_VID_CHECK(mcip, mp_chain, err); + if (err != 0) { + freemsg(mp_chain); + mcip->mci_stat_oerrors++; + goto done; + } + } + if (MAC_TAG_NEEDED(mcip)) { + mp_chain = mac_add_vlan_tag(mp_chain, 0, + mac_client_vid(mch)); + if (mp_chain == NULL) { + mcip->mci_stat_oerrors++; + goto done; + } + } + } + + obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : + msgdsize(mp_chain)); + + MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); + + if (mp_chain == NULL) { + cookie = NULL; + mcip->mci_stat_obytes += obytes; + mcip->mci_stat_opackets += 1; + if ((srs->srs_type & SRST_FLOW) != 0) { + FLOW_STAT_UPDATE(flent, obytes, obytes); + FLOW_STAT_UPDATE(flent, opackets, 1); + } + } else { + mutex_enter(&srs->srs_lock); + cookie = mac_tx_srs_no_desc(srs, mp_chain, + flag, ret_mp); + mutex_exit(&srs->srs_lock); + } + } else { + cookie = srs_tx->st_func(srs, mp_chain, hint, flag, ret_mp); + } + +done: + if (is_subflow) + FLOW_REFRELE(flent); + + if (!(flag & MAC_TX_NO_HOLD)) + MAC_TX_RELE(mcip, mytx); + + return (cookie); +} + +/* + * mac_tx_is_flow_blocked + * + * Given a cookie, it returns whether the ring identified by the cookie is + * flow-controlled or not (this is not implemented yet). If NULL is + * passed in place of a cookie, then it finds out if any of the + * underlying rings belonging to the SRS is flow controlled or not + * and returns that status.
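 *
 * Illustrative use (editor's sketch) by a sender honoring flow
 * control:
 *
 *	if (mac_tx_is_flow_blocked(mch, NULL))
 *		... defer further transmits until the SRS drains ...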
+ */ +/* ARGSUSED */ +boolean_t +mac_tx_is_flow_blocked(mac_client_handle_t mch, mac_tx_cookie_t cookie) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_soft_ring_set_t *mac_srs = MCIP_TX_SRS(mcip); + mac_soft_ring_t *sringp; + boolean_t blocked = B_FALSE; + int i; + + /* + * On etherstubs, there won't be a Tx SRS or an Rx + * SRS. In fact, there won't even be a flow_entry. + */ + if (mac_srs == NULL) + return (B_FALSE); + + mutex_enter(&mac_srs->srs_lock); + if (mac_srs->srs_tx.st_mode == SRS_TX_FANOUT) { + for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { + sringp = mac_srs->srs_oth_soft_rings[i]; + mutex_enter(&sringp->s_ring_lock); + if (sringp->s_ring_state & S_RING_TX_HIWAT) { + blocked = B_TRUE; + mutex_exit(&sringp->s_ring_lock); + break; + } + mutex_exit(&sringp->s_ring_lock); + } + } else { + blocked = (mac_srs->srs_state & SRS_TX_HIWAT); + } + mutex_exit(&mac_srs->srs_lock); + return (blocked); +} + +/* + * Check if the MAC client is the primary MAC client. + */ +boolean_t +mac_is_primary_client(mac_client_impl_t *mcip) +{ + return (mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY); +} + +void +mac_ioctl(mac_handle_t mh, queue_t *wq, mblk_t *bp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + int cmd = ((struct iocblk *)bp->b_rptr)->ioc_cmd; + + if ((cmd == ND_GET && (mip->mi_callbacks->mc_callbacks & MC_GETPROP)) || + (cmd == ND_SET && (mip->mi_callbacks->mc_callbacks & MC_SETPROP))) { + /* + * If ndd props were registered, call them. + * Note that ndd ioctls are obsolete. + */ + mac_ndd_ioctl(mip, wq, bp); + return; + } + + /* + * Call the driver to handle the ioctl. The driver may not support + * any ioctls, in which case we reply with a NAK on its behalf. + */ + if (mip->mi_callbacks->mc_callbacks & MC_IOCTL) + mip->mi_ioctl(mip->mi_driver, wq, bp); + else + miocnak(wq, bp, 0, EINVAL); +} + +/* + * Return the link state of the specified MAC instance. + */ +link_state_t +mac_link_get(mac_handle_t mh) +{ + return (((mac_impl_t *)mh)->mi_linkstate); +} + +/* + * Add a mac client specified notification callback. Please see the comments + * above mac_callback_add() for general information about mac callback + * addition/deletion in the presence of mac callback list walkers. + */ +mac_notify_handle_t +mac_notify_add(mac_handle_t mh, mac_notify_t notify_fn, void *arg) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_notify_cb_t *mncb; + mac_cb_info_t *mcbi; + + /* + * Allocate a notify callback structure, fill in the details and + * use the mac callback list manipulation functions to chain into + * the list of callbacks.
+ */ + mncb = kmem_zalloc(sizeof (mac_notify_cb_t), KM_SLEEP); + mncb->mncb_fn = notify_fn; + mncb->mncb_arg = arg; + mncb->mncb_mip = mip; + mncb->mncb_link.mcb_objp = mncb; + mncb->mncb_link.mcb_objsize = sizeof (mac_notify_cb_t); + mncb->mncb_link.mcb_flags = MCB_NOTIFY_CB_T; + + mcbi = &mip->mi_notify_cb_info; + + i_mac_perim_enter(mip); + mutex_enter(mcbi->mcbi_lockp); + + mac_callback_add(&mip->mi_notify_cb_info, &mip->mi_notify_cb_list, + &mncb->mncb_link); + + mutex_exit(mcbi->mcbi_lockp); + i_mac_perim_exit(mip); + return ((mac_notify_handle_t)mncb); +} + +void +mac_notify_remove_wait(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_cb_info_t *mcbi = &mip->mi_notify_cb_info; + + mutex_enter(mcbi->mcbi_lockp); + mac_callback_remove_wait(&mip->mi_notify_cb_info); + mutex_exit(mcbi->mcbi_lockp); +} + +/* + * Remove a mac client specified notification callback + */ +int +mac_notify_remove(mac_notify_handle_t mnh, boolean_t wait) +{ + mac_notify_cb_t *mncb = (mac_notify_cb_t *)mnh; + mac_impl_t *mip = mncb->mncb_mip; + mac_cb_info_t *mcbi; + int err = 0; + + mcbi = &mip->mi_notify_cb_info; + + i_mac_perim_enter(mip); + mutex_enter(mcbi->mcbi_lockp); + + ASSERT(mncb->mncb_link.mcb_objp == mncb); + /* + * If there aren't any list walkers, the remove would succeed + * inline, else we wait for the deferred remove to complete + */ + if (mac_callback_remove(&mip->mi_notify_cb_info, + &mip->mi_notify_cb_list, &mncb->mncb_link)) { + kmem_free(mncb, sizeof (mac_notify_cb_t)); + } else { + err = EBUSY; + } + + mutex_exit(mcbi->mcbi_lockp); + i_mac_perim_exit(mip); + + /* + * If we failed to remove the notification callback and "wait" is set + * to be B_TRUE, wait for the callback to finish after we exit the + * mac perimeter. + */ + if (err != 0 && wait) { + mac_notify_remove_wait((mac_handle_t)mip); + return (0); + } + + return (err); +} + +/* + * Associate resource management callbacks with the specified MAC + * clients. 
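+ *
+ * As an illustration (hypothetical client code; my_ring_add_cb and
+ * my_state are names invented for this sketch), a client interested
+ * only in ring additions could register just that callback:
+ *
+ *	mac_resource_set(mch, my_ring_add_cb, my_state);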
+ */ + +void +mac_resource_set_common(mac_client_handle_t mch, mac_resource_add_t add, + mac_resource_remove_t remove, mac_resource_quiesce_t quiesce, + mac_resource_restart_t restart, mac_resource_bind_t bind, + void *arg) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + mcip->mci_resource_add = add; + mcip->mci_resource_remove = remove; + mcip->mci_resource_quiesce = quiesce; + mcip->mci_resource_restart = restart; + mcip->mci_resource_bind = bind; + mcip->mci_resource_arg = arg; + + if (arg == NULL) + mcip->mci_state_flags &= ~MCIS_CLIENT_POLL_CAPABLE; +} + +void +mac_resource_set(mac_client_handle_t mch, mac_resource_add_t add, void *arg) +{ + /* update the 'resource_add' callback */ + mac_resource_set_common(mch, add, NULL, NULL, NULL, NULL, arg); +} + +/* + * Sets up the client resources and enable the polling interface over all the + * SRS's and the soft rings of the client + */ +void +mac_client_poll_enable(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_soft_ring_set_t *mac_srs; + flow_entry_t *flent; + int i; + + flent = mcip->mci_flent; + ASSERT(flent != NULL); + + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i]; + ASSERT(mac_srs->srs_mcip == mcip); + mac_srs_client_poll_enable(mcip, mac_srs); + } +} + +/* + * Tears down the client resources and disable the polling interface over all + * the SRS's and the soft rings of the client + */ +void +mac_client_poll_disable(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_soft_ring_set_t *mac_srs; + flow_entry_t *flent; + int i; + + flent = mcip->mci_flent; + ASSERT(flent != NULL); + + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i]; + ASSERT(mac_srs->srs_mcip == mcip); + mac_srs_client_poll_disable(mcip, mac_srs); + } +} + +/* + * Associate the CPUs specified by the given property with a MAC client. + */ +int +mac_cpu_set(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + int err = 0; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + if ((err = mac_validate_props(mrp)) != 0) + return (err); + + if (MCIP_DATAPATH_SETUP(mcip)) + mac_flow_modify(mip->mi_flow_tab, mcip->mci_flent, mrp); + + mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip), B_FALSE); + return (0); +} + +/* + * Apply the specified properties to the specified MAC client. + */ +int +mac_client_set_resources(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + int err = 0; + + i_mac_perim_enter(mip); + + if ((mrp->mrp_mask & MRP_MAXBW) || (mrp->mrp_mask & MRP_PRIORITY)) { + err = mac_resource_ctl_set(mch, mrp); + if (err != 0) { + i_mac_perim_exit(mip); + return (err); + } + } + + if (mrp->mrp_mask & MRP_CPUS) + err = mac_cpu_set(mch, mrp); + + i_mac_perim_exit(mip); + return (err); +} + +/* + * Return the properties currently associated with the specified MAC client. + */ +void +mac_client_get_resources(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + + bcopy(mcip_mrp, mrp, sizeof (mac_resource_props_t)); +} + +/* + * Pass a copy of the specified packet to the promiscuous callbacks + * of the specified MAC. 
+ *
+ * If sender is NULL, the function is being invoked for a packet chain
+ * received from the wire. If sender is non-NULL, it points to
+ * the MAC client from which the packet is being sent.
+ *
+ * The packets are distributed to the promiscuous callbacks as follows:
+ *
+ * - all packets are sent to the MAC_CLIENT_PROMISC_ALL callbacks
+ * - all broadcast and multicast packets are sent to the
+ * MAC_CLIENT_PROMISC_FILTERED and MAC_CLIENT_PROMISC_MULTI callbacks.
+ *
+ * The unicast packets of MAC_CLIENT_PROMISC_FILTERED callbacks are
+ * dispatched after classification by mac_rx_deliver().
+ */
+
+static void
+mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
+ boolean_t loopback)
+{
+ mblk_t *mp_copy;
+
+ mp_copy = copymsg(mp);
+ if (mp_copy == NULL)
+ return;
+ mp_copy->b_next = NULL;
+
+ mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
+}
+
+/*
+ * Return the VID of a packet. Zero if the packet is not tagged.
+ */
+static uint16_t
+mac_ether_vid(mblk_t *mp)
+{
+ struct ether_header *eth = (struct ether_header *)mp->b_rptr;
+
+ if (ntohs(eth->ether_type) == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *t_evhp =
+ (struct ether_vlan_header *)mp->b_rptr;
+ return (VLAN_ID(ntohs(t_evhp->ether_tci)));
+ }
+
+ return (0);
+}
+
+/*
+ * Return whether the specified packet contains a multicast or broadcast
+ * destination MAC address.
+ */
+static boolean_t
+mac_is_mcast(mac_impl_t *mip, mblk_t *mp)
+{
+ mac_header_info_t hdr_info;
+
+ if (mac_header_info((mac_handle_t)mip, mp, &hdr_info) != 0)
+ return (B_FALSE);
+ return ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
+ (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST));
+}
+
+/*
+ * Send a copy of an mblk chain to the MAC clients of the specified MAC.
+ * "sender" points to the sender MAC client for outbound packets, and
+ * is set to NULL for inbound packets.
+ */
+void
+mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
+ mac_client_impl_t *sender)
+{
+ mac_promisc_impl_t *mpip;
+ mac_cb_t *mcb;
+ mblk_t *mp;
+ boolean_t is_mcast, is_sender;
+
+ MAC_PROMISC_WALKER_INC(mip);
+ for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
+ is_mcast = mac_is_mcast(mip, mp);
+ /* send packet to interested callbacks */
+ for (mcb = mip->mi_promisc_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
+ is_sender = (mpip->mpi_mcip == sender);
+
+ if (is_sender && mpip->mpi_no_tx_loop)
+ /*
+ * The sender doesn't want to receive
+ * copies of the packets it sends.
+ */
+ continue;
+
+ /*
+ * For an Ethernet MAC, don't dispatch a multicast
+ * packet to non-PROMISC_ALL callbacks unless the VID
+ * of the packet matches the VID of the client.
+ */
+ if (is_mcast &&
+ mpip->mpi_type != MAC_CLIENT_PROMISC_ALL &&
+ !mac_client_check_flow_vid(mpip->mpi_mcip,
+ mac_ether_vid(mp)))
+ continue;
+
+ if (is_sender ||
+ mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
+ is_mcast)
+ mac_promisc_dispatch_one(mpip, mp, is_sender);
+ }
+ }
+ MAC_PROMISC_WALKER_DCR(mip);
+}
+
+void
+mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_promisc_impl_t *mpip;
+ boolean_t is_mcast;
+ mblk_t *mp;
+ mac_cb_t *mcb;
+
+ /*
+ * The unicast packets for the MAC client still
+ * need to be delivered to the MAC_CLIENT_PROMISC_FILTERED
+ * promiscuous callbacks. The broadcast and multicast
+ * packets were delivered from mac_rx().
+ */ + MAC_PROMISC_WALKER_INC(mip); + for (mp = mp_chain; mp != NULL; mp = mp->b_next) { + is_mcast = mac_is_mcast(mip, mp); + for (mcb = mcip->mci_promisc_list; mcb != NULL; + mcb = mcb->mcb_nextp) { + mpip = (mac_promisc_impl_t *)mcb->mcb_objp; + if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && + !is_mcast) { + mac_promisc_dispatch_one(mpip, mp, B_FALSE); + } + } + } + MAC_PROMISC_WALKER_DCR(mip); +} + +/* + * Return the margin value currently assigned to the specified MAC instance. + */ +void +mac_margin_get(mac_handle_t mh, uint32_t *marginp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + rw_enter(&(mip->mi_rw_lock), RW_READER); + *marginp = mip->mi_margin; + rw_exit(&(mip->mi_rw_lock)); +} + +/* + * mac_info_get() is used for retrieving the mac_info when a DL_INFO_REQ is + * issued before a DL_ATTACH_REQ. we walk the i_mac_impl_hash table and find + * the first mac_impl_t with a matching driver name; then we copy its mac_info_t + * to the caller. we do all this with i_mac_impl_lock held so the mac_impl_t + * cannot disappear while we are accessing it. + */ +typedef struct i_mac_info_state_s { + const char *mi_name; + mac_info_t *mi_infop; +} i_mac_info_state_t; + +/*ARGSUSED*/ +static uint_t +i_mac_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + i_mac_info_state_t *statep = arg; + mac_impl_t *mip = (mac_impl_t *)val; + + if (mip->mi_state_flags & MIS_DISABLED) + return (MH_WALK_CONTINUE); + + if (strcmp(statep->mi_name, + ddi_driver_name(mip->mi_dip)) != 0) + return (MH_WALK_CONTINUE); + + statep->mi_infop = &mip->mi_info; + return (MH_WALK_TERMINATE); +} + +boolean_t +mac_info_get(const char *name, mac_info_t *minfop) +{ + i_mac_info_state_t state; + + rw_enter(&i_mac_impl_lock, RW_READER); + state.mi_name = name; + state.mi_infop = NULL; + mod_hash_walk(i_mac_impl_hash, i_mac_info_walker, &state); + if (state.mi_infop == NULL) { + rw_exit(&i_mac_impl_lock); + return (B_FALSE); + } + *minfop = *state.mi_infop; + rw_exit(&i_mac_impl_lock); + return (B_TRUE); +} + +/* + * To get the capabilities that MAC layer cares about, such as rings, factory + * mac address, vnic or not, it should directly invoke this function + */ +boolean_t +i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) + return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); + else + return (B_FALSE); +} + +/* + * Capability query function. If number of active mac clients is greater than + * 1, only limited capabilities can be advertised to the caller no matter the + * driver has certain capability or not. Else, we query the driver to get the + * capability. + */ +boolean_t +mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + /* + * if mi_nactiveclients > 1, only MAC_CAPAB_HCKSUM, + * MAC_CAPAB_NO_NATIVEVLAN, MAC_CAPAB_NO_ZCOPY can be advertised. 
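+ * (MAC_CAPAB_HCKSUM is still forwarded to the driver below, while the
+ * two NO_* capabilities are simply answered in the affirmative.)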
+ */ + if (mip->mi_nactiveclients > 1) { + switch (cap) { + case MAC_CAPAB_HCKSUM: + return (i_mac_capab_get(mh, cap, cap_data)); + case MAC_CAPAB_NO_NATIVEVLAN: + case MAC_CAPAB_NO_ZCOPY: + return (B_TRUE); + default: + return (B_FALSE); + } + } + + /* else get capab from driver */ + return (i_mac_capab_get(mh, cap, cap_data)); +} + +boolean_t +mac_sap_verify(mac_handle_t mh, uint32_t sap, uint32_t *bind_sap) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_ops.mtops_sap_verify(sap, bind_sap, + mip->mi_pdata)); +} + +mblk_t * +mac_header(mac_handle_t mh, const uint8_t *daddr, uint32_t sap, mblk_t *payload, + size_t extra_len) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_ops.mtops_header(mip->mi_addr, daddr, sap, + mip->mi_pdata, payload, extra_len)); +} + +int +mac_header_info(mac_handle_t mh, mblk_t *mp, mac_header_info_t *mhip) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_ops.mtops_header_info(mp, mip->mi_pdata, + mhip)); +} + +mblk_t * +mac_header_cook(mac_handle_t mh, mblk_t *mp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_COOK) { + if (DB_REF(mp) > 1) { + mblk_t *newmp = copymsg(mp); + if (newmp == NULL) + return (NULL); + freemsg(mp); + mp = newmp; + } + return (mip->mi_type->mt_ops.mtops_header_cook(mp, + mip->mi_pdata)); + } + return (mp); +} + +mblk_t * +mac_header_uncook(mac_handle_t mh, mblk_t *mp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_UNCOOK) { + if (DB_REF(mp) > 1) { + mblk_t *newmp = copymsg(mp); + if (newmp == NULL) + return (NULL); + freemsg(mp); + mp = newmp; + } + return (mip->mi_type->mt_ops.mtops_header_uncook(mp, + mip->mi_pdata)); + } + return (mp); +} + +uint_t +mac_addr_len(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_addr_length); +} + +/* True if a MAC is a VNIC */ +boolean_t +mac_is_vnic(mac_handle_t mh) +{ + return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); +} + +mac_handle_t +mac_get_lower_mac_handle(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(mac_is_vnic(mh)); + return (((vnic_t *)mip->mi_driver)->vn_lower_mh); +} + +void +mac_update_resources(mac_resource_props_t *nmrp, mac_resource_props_t *cmrp, + boolean_t is_user_flow) +{ + if (nmrp != NULL && cmrp != NULL) { + if (nmrp->mrp_mask & MRP_PRIORITY) { + if (nmrp->mrp_priority == MPL_RESET) { + cmrp->mrp_mask &= ~MRP_PRIORITY; + if (is_user_flow) { + cmrp->mrp_priority = + MPL_SUBFLOW_DEFAULT; + } else { + cmrp->mrp_priority = MPL_LINK_DEFAULT; + } + } else { + cmrp->mrp_mask |= MRP_PRIORITY; + cmrp->mrp_priority = nmrp->mrp_priority; + } + } + if (nmrp->mrp_mask & MRP_MAXBW) { + cmrp->mrp_maxbw = nmrp->mrp_maxbw; + if (nmrp->mrp_maxbw == MRP_MAXBW_RESETVAL) + cmrp->mrp_mask &= ~MRP_MAXBW; + else + cmrp->mrp_mask |= MRP_MAXBW; + } + if (nmrp->mrp_mask & MRP_CPUS) + MAC_COPY_CPUS(nmrp, cmrp); + } +} + +/* + * i_mac_set_resources: + * + * This routine associates properties with the primary MAC client of + * the specified MAC instance. 
+ * - Cache the properties in mac_impl_t + * - Apply the properties to the primary MAC client if exists + */ +int +i_mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; + int err = 0; + mac_resource_props_t tmrp; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + err = mac_validate_props(mrp); + if (err != 0) + return (err); + + /* + * Since bind_cpu may be modified by mac_client_set_resources() + * we use a copy of bind_cpu and finally cache bind_cpu in mip. + * This allows us to cache only user edits in mip. + */ + bcopy(mrp, &tmrp, sizeof (mac_resource_props_t)); + mcip = mac_primary_client_handle(mip); + if (mcip != NULL) { + err = + mac_client_set_resources((mac_client_handle_t)mcip, &tmrp); + } + /* if mac_client_set_resources failed, do not update the values */ + if (err == 0) + mac_update_resources(mrp, &mip->mi_resource_props, B_FALSE); + return (err); +} + +int +mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp) +{ + int err; + + i_mac_perim_enter((mac_impl_t *)mh); + err = i_mac_set_resources(mh, mrp); + i_mac_perim_exit((mac_impl_t *)mh); + return (err); +} + +/* + * Get the properties cached for the specified MAC instance. + */ +void +mac_get_resources(mac_handle_t mh, mac_resource_props_t *mrp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; + + if (mip->mi_state_flags & MIS_IS_VNIC) { + mcip = mac_primary_client_handle(mip); + if (mcip != NULL) { + mac_client_get_resources((mac_client_handle_t)mcip, + mrp); + return; + } + } + bcopy(&mip->mi_resource_props, mrp, sizeof (mac_resource_props_t)); +} + +/* + * Rename a mac client, its flow, and the kstat. + */ +int +mac_rename_primary(mac_handle_t mh, const char *new_name) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *cur_clnt = NULL; + flow_entry_t *fep; + + i_mac_perim_enter(mip); + + /* + * VNICs: we need to change the sys flow name and + * the associated flow kstat. + */ + if (mip->mi_state_flags & MIS_IS_VNIC) { + ASSERT(new_name != NULL); + mac_rename_flow_names(mac_vnic_lower(mip), new_name); + goto done; + } + /* + * This mac may itself be an aggr link, or it may have some client + * which is an aggr port. For both cases, we need to change the + * aggr port's mac client name, its flow name and the associated flow + * kstat. + */ + if (mip->mi_state_flags & MIS_IS_AGGR) { + mac_capab_aggr_t aggr_cap; + mac_rename_fn_t rename_fn; + boolean_t ret; + + ASSERT(new_name != NULL); + ret = i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, + (void *)(&aggr_cap)); + ASSERT(ret == B_TRUE); + rename_fn = aggr_cap.mca_rename_fn; + rename_fn(new_name, mip->mi_driver); + /* + * The aggr's client name and kstat flow name will be + * updated below, i.e. via mac_rename_flow_names. 
+ */
+ }
+
+ for (cur_clnt = mip->mi_clients_list; cur_clnt != NULL;
+ cur_clnt = cur_clnt->mci_client_next) {
+ if (cur_clnt->mci_state_flags & MCIS_IS_AGGR_PORT) {
+ if (new_name != NULL) {
+ char *str_st = cur_clnt->mci_name;
+ char *str_del = strchr(str_st, '-');
+
+ ASSERT(str_del != NULL);
+ bzero(str_del + 1, MAXNAMELEN -
+ (str_del - str_st + 1));
+ bcopy(new_name, str_del + 1,
+ strlen(new_name));
+ }
+ fep = cur_clnt->mci_flent;
+ mac_rename_flow(fep, cur_clnt->mci_name);
+ break;
+ } else if (new_name != NULL &&
+ cur_clnt->mci_state_flags & MCIS_USE_DATALINK_NAME) {
+ mac_rename_flow_names(cur_clnt, new_name);
+ break;
+ }
+ }
+
+done:
+ i_mac_perim_exit(mip);
+ return (0);
+}
+
+/*
+ * Rename the MAC client's flow names
+ */
+static void
+mac_rename_flow_names(mac_client_impl_t *mcip, const char *new_name)
+{
+ flow_entry_t *flent;
+ uint16_t vid;
+ char flowname[MAXFLOWNAME];
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+ * Use mi_rw_lock to ensure that threads not in the mac perimeter
+ * see a self-consistent value for mci_name
+ */
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ (void) strlcpy(mcip->mci_name, new_name, sizeof (mcip->mci_name));
+ rw_exit(&mip->mi_rw_lock);
+
+ mac_rename_flow(mcip->mci_flent, new_name);
+
+ if (mcip->mci_nflents == 1)
+ return;
+
+ /*
+ * We have to rename all the others too; there are no stats to
+ * destroy for these.
+ */
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ if (flent != mcip->mci_flent) {
+ vid = i_mac_flow_vid(flent);
+ (void) sprintf(flowname, "%s%u", new_name, vid);
+ mac_flow_set_name(flent, flowname);
+ }
+ }
+}
+
+
+/*
+ * Add a flow to the MAC client's flow list - i.e. the list of MAC/VID
+ * tuples defined for the specified MAC client.
+ */
+static void
+mac_client_add_to_flow_list(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * The promisc Rx data path walks the mci_flent_list. Protect by
+ * using mci_rw_lock
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+
+ /* Add it to the head */
+ flent->fe_client_next = mcip->mci_flent_list;
+ mcip->mci_flent_list = flent;
+ mcip->mci_nflents++;
+
+ /*
+ * Keep track of the number of non-zero VIDs per MAC
+ * client to avoid figuring it out in the data-path.
+ */
+ if (i_mac_flow_vid(flent) != VLAN_ID_NONE)
+ mcip->mci_nvids++;
+
+ rw_exit(&mcip->mci_rw_lock);
+}
+
+/*
+ * Remove a flow entry from the MAC client's list.
+ */
+static void
+mac_client_remove_flow_from_list(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ flow_entry_t *fe = mcip->mci_flent_list;
+ flow_entry_t *prev_fe = NULL;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * The promisc Rx data path walks the mci_flent_list. Protect by
+ * using mci_rw_lock
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ while ((fe != NULL) && (fe != flent)) {
+ prev_fe = fe;
+ fe = fe->fe_client_next;
+ }
+
+ /* XXX should be an ASSERT */
+ if (fe != NULL) {
+ if (prev_fe == NULL) {
+ /* Deleting the first node */
+ mcip->mci_flent_list = fe->fe_client_next;
+ } else {
+ prev_fe->fe_client_next = fe->fe_client_next;
+ }
+ mcip->mci_nflents--;
+
+ if (i_mac_flow_vid(flent) != VLAN_ID_NONE)
+ mcip->mci_nvids--;
+ }
+ rw_exit(&mcip->mci_rw_lock);
+}
+
+/*
+ * Check if the given VID belongs to this MAC client.
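+ * This is used on the promiscuous Rx path (see mac_promisc_dispatch())
+ * to avoid handing a multicast packet to a client whose VLAN(s) do not
+ * match the packet's VID.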
+ */ +boolean_t +mac_client_check_flow_vid(mac_client_impl_t *mcip, uint16_t vid) +{ + flow_entry_t *flent; + uint16_t mci_vid; + + /* The mci_flent_list is protected by mci_rw_lock */ + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + mci_vid = i_mac_flow_vid(flent); + if (vid == mci_vid) { + rw_exit(&mcip->mci_rw_lock); + return (B_TRUE); + } + } + rw_exit(&mcip->mci_rw_lock); + return (B_FALSE); +} + +/* + * Get the flow entry for the specified <MAC addr, VID> tuple. + */ +static flow_entry_t * +mac_client_get_flow(mac_client_impl_t *mcip, mac_unicast_impl_t *muip) +{ + mac_address_t *map = mcip->mci_unicast; + flow_entry_t *flent; + uint16_t vid; + flow_desc_t flow_desc; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + mac_flow_get_desc(mcip->mci_flent, &flow_desc); + if (bcmp(flow_desc.fd_dst_mac, map->ma_addr, map->ma_len) != 0) + return (NULL); + + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + vid = i_mac_flow_vid(flent); + if (vid == muip->mui_vid) { + return (flent); + } + } + + return (NULL); +} + +/* + * Since mci_flent has the SRSs, when we want to remove it, we replace + * the flow_desc_t in mci_flent with that of an existing flent and then + * remove that flent instead of mci_flent. + */ +static flow_entry_t * +mac_client_swap_mciflent(mac_client_impl_t *mcip) +{ + flow_entry_t *flent = mcip->mci_flent; + flow_tab_t *ft = flent->fe_flow_tab; + flow_entry_t *flent1; + flow_desc_t fl_desc; + char fl_name[MAXFLOWNAME]; + int err; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + ASSERT(mcip->mci_nflents > 1); + + /* get the next flent following the primary flent */ + flent1 = mcip->mci_flent_list->fe_client_next; + ASSERT(flent1 != NULL && flent1->fe_flow_tab == ft); + + /* + * Remove the flent from the flow table before updating the + * flow descriptor as the hash depends on the flow descriptor. + * This also helps incoming packet classification avoid having + * to grab fe_lock. Access to fe_flow_desc of a flent not in the + * flow table is done under the fe_lock so that log or stat functions + * see a self-consistent fe_flow_desc. The name and desc are specific + * to a flow, the rest are shared by all the clients, including + * resource control etc. + */ + mac_flow_remove(ft, flent, B_TRUE); + mac_flow_remove(ft, flent1, B_TRUE); + + bcopy(&flent->fe_flow_desc, &fl_desc, sizeof (flow_desc_t)); + bcopy(flent->fe_flow_name, fl_name, MAXFLOWNAME); + + /* update the primary flow entry */ + mutex_enter(&flent->fe_lock); + bcopy(&flent1->fe_flow_desc, &flent->fe_flow_desc, + sizeof (flow_desc_t)); + bcopy(&flent1->fe_flow_name, &flent->fe_flow_name, MAXFLOWNAME); + mutex_exit(&flent->fe_lock); + + /* update the flow entry that is to be freed */ + mutex_enter(&flent1->fe_lock); + bcopy(&fl_desc, &flent1->fe_flow_desc, sizeof (flow_desc_t)); + bcopy(fl_name, &flent1->fe_flow_name, MAXFLOWNAME); + mutex_exit(&flent1->fe_lock); + + /* now reinsert the flow entries in the table */ + err = mac_flow_add(ft, flent); + ASSERT(err == 0); + + err = mac_flow_add(ft, flent1); + ASSERT(err == 0); + + return (flent1); +} + +/* + * Return whether there is only one flow entry associated with this + * MAC client. 
+ */ +static boolean_t +mac_client_single_rcvr(mac_client_impl_t *mcip) +{ + return (mcip->mci_nflents == 1); +} + +int +mac_validate_props(mac_resource_props_t *mrp) +{ + if (mrp == NULL) + return (0); + + if (mrp->mrp_mask & MRP_PRIORITY) { + mac_priority_level_t pri = mrp->mrp_priority; + + if (pri < MPL_LOW || pri > MPL_RESET) + return (EINVAL); + } + + if (mrp->mrp_mask & MRP_MAXBW) { + uint64_t maxbw = mrp->mrp_maxbw; + + if (maxbw < MRP_MAXBW_MINVAL && maxbw != 0) + return (EINVAL); + } + if (mrp->mrp_mask & MRP_CPUS) { + int i; + mac_cpu_mode_t fanout; + + if (mrp->mrp_ncpus > ncpus || mrp->mrp_ncpus > MAX_SR_FANOUT) + return (EINVAL); + + for (i = 0; i < mrp->mrp_ncpus; i++) { + cpu_t *cp; + int rv; + + mutex_enter(&cpu_lock); + cp = cpu_get(mrp->mrp_cpu[i]); + if (cp != NULL) + rv = cpu_is_online(cp); + else + rv = 0; + mutex_exit(&cpu_lock); + if (rv == 0) + return (EINVAL); + } + + fanout = mrp->mrp_fanout_mode; + if (fanout < 0 || fanout > MCM_CPUS) + return (EINVAL); + } + return (0); +} + +/* + * Send a MAC_NOTE_LINK notification to all the MAC clients whenever the + * underlying physical link is down. This is to allow MAC clients to + * communicate with other clients. + */ +void +mac_virtual_link_update(mac_impl_t *mip) +{ + if (mip->mi_linkstate != LINK_STATE_UP) + i_mac_notify(mip, MAC_NOTE_LINK); +} + +/* + * For clients that have a pass-thru MAC, e.g. VNIC, we set the VNIC's + * mac handle in the client. + */ +void +mac_set_upper_mac(mac_client_handle_t mch, mac_handle_t mh) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + mcip->mci_upper_mip = (mac_impl_t *)mh; +} + +/* + * Mark the mac as being used exclusively by the single mac client that is + * doing some control operation on this mac. No further opens of this mac + * will be allowed until this client calls mac_unmark_exclusive. The mac + * client calling this function must already be in the mac perimeter + */ +int +mac_mark_exclusive(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + /* + * Look up its entry in the global hash table. + */ + rw_enter(&i_mac_impl_lock, RW_WRITER); + if (mip->mi_state_flags & MIS_DISABLED) { + rw_exit(&i_mac_impl_lock); + return (ENOENT); + } + + /* + * A reference to mac is held even if the link is not plumbed. + * In i_dls_link_create() we open the MAC interface and hold the + * reference. There is an additional reference for the mac_open + * done in acquiring the mac perimeter + */ + if (mip->mi_ref != 2) { + rw_exit(&i_mac_impl_lock); + return (EBUSY); + } + + ASSERT(!(mip->mi_state_flags & MIS_EXCLUSIVE_HELD)); + mip->mi_state_flags |= MIS_EXCLUSIVE_HELD; + rw_exit(&i_mac_impl_lock); + return (0); +} + +void +mac_unmark_exclusive(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + + rw_enter(&i_mac_impl_lock, RW_WRITER); + /* 1 for the creation and another for the perimeter */ + ASSERT(mip->mi_ref == 2 && (mip->mi_state_flags & MIS_EXCLUSIVE_HELD)); + mip->mi_state_flags &= ~MIS_EXCLUSIVE_HELD; + rw_exit(&i_mac_impl_lock); +} + +/* + * Set the MTU for the specified device. The function returns EBUSY if + * another MAC client prevents the caller to become the exclusive client. + * Returns EAGAIN if the client is started. 
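+ *
+ * A minimal calling sketch (the MTU value here is illustrative only):
+ *
+ *	uint_t old_mtu;
+ *	int err = mac_set_mtu(mh, 9000, &old_mtu);
+ *
+ * where EBUSY means another client holds the mac exclusively and
+ * EAGAIN means the NIC is started, so the MTU cannot be changed.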
+ */ +int +mac_set_mtu(mac_handle_t mh, uint_t new_mtu, uint_t *old_mtu_arg) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + uint_t old_mtu; + int rv; + boolean_t exclusive = B_FALSE; + + i_mac_perim_enter(mip); + + if ((mip->mi_callbacks->mc_callbacks & MC_SETPROP) == 0 || + (mip->mi_callbacks->mc_callbacks & MC_GETPROP) == 0) { + rv = ENOTSUP; + goto bail; + } + + if ((rv = mac_mark_exclusive(mh)) != 0) + goto bail; + exclusive = B_TRUE; + + if (mip->mi_active > 0) { + /* + * The MAC instance is started, for example due to the + * presence of a promiscuous clients. Fail the operation + * since the MAC's MTU cannot be changed while the NIC + * is started. + */ + rv = EAGAIN; + goto bail; + } + + mac_sdu_get(mh, NULL, &old_mtu); + + if (old_mtu != new_mtu) { + rv = mip->mi_callbacks->mc_setprop(mip->mi_driver, + "mtu", MAC_PROP_MTU, sizeof (uint_t), &new_mtu); + } + +bail: + if (exclusive) + mac_unmark_exclusive(mh); + i_mac_perim_exit(mip); + + if (rv == 0 && old_mtu_arg != NULL) + *old_mtu_arg = old_mtu; + return (rv); +} + +void +mac_get_hwgrp_info(mac_handle_t mh, int grp_index, uint_t *grp_num, + uint_t *n_rings, uint_t *type, uint_t *n_clnts, char *clnts_name) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_grp_client_t *mcip; + uint_t i = 0, index = 0; + + /* Revisit when we implement fully dynamic group allocation */ + ASSERT(grp_index >= 0 && grp_index < mip->mi_rx_group_count); + + rw_enter(&mip->mi_rw_lock, RW_READER); + *grp_num = mip->mi_rx_groups[grp_index].mrg_index; + *type = mip->mi_rx_groups[grp_index].mrg_type; + *n_rings = mip->mi_rx_groups[grp_index].mrg_cur_count; + for (mcip = mip->mi_rx_groups[grp_index].mrg_clients; mcip != NULL; + mcip = mcip->mgc_next) { + int name_len = strlen(mcip->mgc_client->mci_name); + + /* + * MAXCLIENTNAMELEN is the buffer size reserved for client + * names. + * XXXX Formating the client name string needs to be moved + * to user land when fixing the size of dhi_clnts in + * dld_hwgrpinfo_t. We should use n_clients * client_name for + * dhi_clntsin instead of MAXCLIENTNAMELEN + */ + if (index + name_len >= MAXCLIENTNAMELEN) { + index = MAXCLIENTNAMELEN; + break; + } + bcopy(mcip->mgc_client->mci_name, &(clnts_name[index]), + name_len); + index += name_len; + clnts_name[index++] = ','; + i++; + } + + /* Get rid of the last , */ + if (index > 0) + clnts_name[index - 1] = '\0'; + *n_clnts = i; + rw_exit(&mip->mi_rw_lock); +} + +uint_t +mac_hwgrp_num(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_rx_group_count); +} diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c new file mode 100644 index 0000000000..f265e53f13 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -0,0 +1,3347 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static void mac_srs_soft_rings_signal(mac_soft_ring_set_t *, uint_t);
+static void mac_srs_update_fanout_list(mac_soft_ring_set_t *);
+static void mac_srs_poll_unbind(mac_soft_ring_set_t *);
+static void mac_srs_worker_unbind(mac_soft_ring_set_t *);
+static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *, uint_t);
+
+static int mac_srs_cpu_setup(cpu_setup_t, int, void *);
+static void mac_srs_worker_bind(mac_soft_ring_set_t *, processorid_t);
+static void mac_srs_poll_bind(mac_soft_ring_set_t *, processorid_t);
+static void mac_srs_threads_unbind(mac_soft_ring_set_t *);
+static void mac_srs_add_glist(mac_soft_ring_set_t *);
+static void mac_srs_remove_glist(mac_soft_ring_set_t *);
+static void mac_srs_fanout_list_free(mac_soft_ring_set_t *);
+static void mac_soft_ring_remove(mac_soft_ring_set_t *, mac_soft_ring_t *);
+
+static int mac_compute_soft_ring_count(flow_entry_t *, int);
+static void mac_walk_srs_and_bind(int);
+static void mac_walk_srs_and_unbind(int);
+
+extern mac_group_t *mac_reserve_rx_group(mac_client_impl_t *, uint8_t *,
+ mac_rx_group_reserve_type_t);
+extern void mac_release_rx_group(mac_client_impl_t *, mac_group_t *);
+
+extern boolean_t mac_latency_optimize;
+
+static kmem_cache_t *mac_srs_cache;
+kmem_cache_t *mac_soft_ring_cache;
+
+/*
+ * The duration in msec we wait before signalling the soft ring
+ * worker thread in case packets get queued.
+ */
+static uint32_t mac_soft_ring_worker_wait = 0;
+
+/*
+ * mac_soft_ring_max_q_cnt needs to be set based on bandwidth and perhaps
+ * latency. Large values could end up consuming a lot of system memory
+ * and cause a system hang.
+ */
+static int mac_soft_ring_max_q_cnt = 1024;
+static int mac_soft_ring_min_q_cnt = 256;
+static int mac_soft_ring_poll_thres = 16;
+
+/*
+ * Default value of the number of TX rings to be assigned to a MAC client.
+ * If fewer than 'mac_tx_ring_count' Tx rings are available, then as many
+ * as are available will be assigned to the newly created MAC client.
+ * If no TX rings are available, then MAC client(s) will be assigned the
+ * default Tx ring. The default Tx ring can be shared among multiple MAC
+ * clients.
+ */
+static uint32_t mac_tx_ring_count = 8;
+static boolean_t mac_tx_serialize = B_FALSE;
+
+/*
+ * mac_tx_srs_hiwat is the queue depth threshold at which callers of
+ * mac_tx() will be notified of the flow control condition.
+ *
+ * TCP does not honour the flow control condition sent up by mac_tx().
+ * Thus provision is made for TCP to allow more packets to be queued
+ * in the SRS, up to a maximum of mac_tx_srs_max_q_cnt.
+ *
+ * Note that mac_tx_srs_hiwat is always less than
+ * mac_tx_srs_max_q_cnt.
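+ * With the defaults below, callers are notified once 1000 packets are
+ * queued, while up to 100000 packets may be queued in all.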
+ */
+static uint32_t mac_tx_srs_max_q_cnt = 100000;
+static uint32_t mac_tx_srs_hiwat = 1000;
+
+/*
+ * mac_rx_soft_ring_count, mac_rx_soft_ring_10gig_count:
+ *
+ * Global tunables that determine the number of soft rings to be used for
+ * fanning out incoming traffic on a link. These counts will be used only
+ * when no explicit set of CPUs was assigned to the data-links.
+ *
+ * The mac_rx_soft_ring_count tunable will come into effect only if
+ * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
+ * default only for sun4v platforms.
+ *
+ * mac_rx_soft_ring_10gig_count will come into effect if you are running on a
+ * 10Gbps link and is not dependent upon mac_soft_ring_enable.
+ *
+ * The number of soft rings for fanout for a link or a flow is determined
+ * by the mac_compute_soft_ring_count() routine. This routine will take into
+ * account mac_soft_ring_enable, mac_rx_soft_ring_count and
+ * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
+ *
+ * If a bandwidth is specified, the determination of the number of soft
+ * rings is based on the specified bandwidth, the CPU speed and the number
+ * of CPUs in the system.
+ */
+static uint_t mac_rx_soft_ring_count = 8;
+static uint_t mac_rx_soft_ring_10gig_count = 8;
+
+/*
+ * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
+ * to mac_srs_g_list and mac_srs_g_lock protects mac_srs_g_list. The
+ * list is used to walk the list of all MAC threads when a CPU is
+ * coming online or going offline.
+ */
+static mac_soft_ring_set_t *mac_srs_g_list = NULL;
+static krwlock_t mac_srs_g_lock;
+
+/*
+ * Whether the SRS threads should be bound, or not.
+ */
+static boolean_t mac_srs_thread_bind = B_TRUE;
+
+/*
+ * CPU to fall back to, used by mac_next_bind_cpu().
+ */
+static processorid_t srs_bind_cpu = 0;
+
+/*
+ * The possible settings for soft_ring_process_flag are
+ * 0 or ST_RING_WORKER_ONLY.
+ */
+static int soft_ring_process_flag = ST_RING_WORKER_ONLY;
+
+/*
+ * If cpu bindings are specified by the user, then the Tx SRS and its
+ * soft rings should also be bound to the CPUs specified by the user. The
+ * CPUs for Tx bindings are at the end of the cpu list provided by
+ * the user. If enough CPUs are not available (for Tx and Rx
+ * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
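+ *
+ * For example (illustrative), with mrp_cpu[] = { 0, 1, 2, 3 } the Tx
+ * SRS worker is bound to CPU 3 (the last entry) and the Tx soft rings
+ * are bound walking backwards from the tail of the list, wrapping
+ * around when there are more soft rings than CPUs.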
+ */ +#define BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) { \ + processorid_t cpuid; \ + int i, j; \ + mac_soft_ring_t *softring; \ + \ + cpuid = mrp->mrp_cpu[mrp->mrp_ncpus - 1]; \ + mac_srs_worker_bind(mac_tx_srs, cpuid); \ + if (TX_MULTI_RING_MODE(mac_tx_srs)) { \ + j = mrp->mrp_ncpus - 1; \ + for (i = 0; \ + i < mac_tx_srs->srs_oth_ring_count; i++, j--) { \ + if (j < 0) \ + j = mrp->mrp_ncpus - 1; \ + cpuid = mrp->mrp_cpu[j]; \ + softring = mac_tx_srs->srs_oth_soft_rings[i]; \ + (void) mac_soft_ring_bind(softring, cpuid); \ + } \ + } \ +} + +/* INIT and FINI ROUTINES */ + +void +mac_soft_ring_init(void) +{ + mac_soft_ring_cache = kmem_cache_create("mac_soft_ring_cache", + sizeof (mac_soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0); + + mac_srs_cache = kmem_cache_create("mac_srs_cache", + sizeof (mac_soft_ring_set_t), + 64, NULL, NULL, NULL, NULL, NULL, 0); + + rw_init(&mac_srs_g_lock, NULL, RW_DEFAULT, NULL); + mutex_enter(&cpu_lock); + register_cpu_setup_func(mac_srs_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +void +mac_soft_ring_finish(void) +{ + mutex_enter(&cpu_lock); + unregister_cpu_setup_func(mac_srs_cpu_setup, NULL); + mutex_exit(&cpu_lock); + rw_destroy(&mac_srs_g_lock); + kmem_cache_destroy(mac_soft_ring_cache); + kmem_cache_destroy(mac_srs_cache); +} + +static void +mac_srs_soft_rings_free(mac_soft_ring_set_t *mac_srs, boolean_t release_tx_ring) +{ + mac_soft_ring_t *softring, *next, *head; + + /* + * Synchronize with mac_walk_srs_bind/unbind which are callbacks from + * DR. The callbacks from DR are called with cpu_lock held, and hence + * can't wait to grab the mac perimeter. The soft ring list is hence + * protected for read access by srs_lock. Changing the soft ring list + * needs the mac perimeter and the srs_lock. + */ + mutex_enter(&mac_srs->srs_lock); + + head = mac_srs->srs_soft_ring_head; + mac_srs->srs_soft_ring_head = NULL; + mac_srs->srs_soft_ring_tail = NULL; + mac_srs->srs_soft_ring_count = 0; + + mutex_exit(&mac_srs->srs_lock); + + for (softring = head; softring != NULL; softring = next) { + next = softring->s_ring_next; + mac_soft_ring_free(softring, release_tx_ring); + } +} + +static void +mac_srs_add_glist(mac_soft_ring_set_t *mac_srs) +{ + ASSERT(mac_srs->srs_next == NULL && mac_srs->srs_prev == NULL); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + + rw_enter(&mac_srs_g_lock, RW_WRITER); + mutex_enter(&mac_srs->srs_lock); + + ASSERT((mac_srs->srs_state & SRS_IN_GLIST) == 0); + + if (mac_srs_g_list == NULL) { + mac_srs_g_list = mac_srs; + } else { + mac_srs->srs_next = mac_srs_g_list; + mac_srs_g_list->srs_prev = mac_srs; + mac_srs->srs_prev = NULL; + mac_srs_g_list = mac_srs; + } + mac_srs->srs_state |= SRS_IN_GLIST; + + mutex_exit(&mac_srs->srs_lock); + rw_exit(&mac_srs_g_lock); +} + +static void +mac_srs_remove_glist(mac_soft_ring_set_t *mac_srs) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + + rw_enter(&mac_srs_g_lock, RW_WRITER); + mutex_enter(&mac_srs->srs_lock); + + ASSERT((mac_srs->srs_state & SRS_IN_GLIST) != 0); + + if (mac_srs == mac_srs_g_list) { + mac_srs_g_list = mac_srs->srs_next; + if (mac_srs_g_list != NULL) + mac_srs_g_list->srs_prev = NULL; + } else { + mac_srs->srs_prev->srs_next = mac_srs->srs_next; + if (mac_srs->srs_next != NULL) + mac_srs->srs_next->srs_prev = mac_srs->srs_prev; + } + mac_srs->srs_state &= ~SRS_IN_GLIST; + + mutex_exit(&mac_srs->srs_lock); + rw_exit(&mac_srs_g_lock); +} + +/* POLLING SETUP AND TEAR DOWN ROUTINES */ + +/* + * mac_srs_client_poll_quiesce and 
mac_srs_client_poll_restart + * + * These routines are used to call back into the upper layer + * (primarily TCP squeue) to stop polling the soft rings or + * restart polling. + */ +void +mac_srs_client_poll_quiesce(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *softring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) { + ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS)); + return; + } + + for (softring = mac_srs->srs_soft_ring_head; + softring != NULL; softring = softring->s_ring_next) { + if ((softring->s_ring_type & ST_RING_TCP) && + (softring->s_ring_rx_arg2 != NULL)) { + mcip->mci_resource_quiesce(mcip->mci_resource_arg, + softring->s_ring_rx_arg2); + } + } +} + +void +mac_srs_client_poll_restart(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *softring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) { + ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS)); + return; + } + + for (softring = mac_srs->srs_soft_ring_head; + softring != NULL; softring = softring->s_ring_next) { + if ((softring->s_ring_type & ST_RING_TCP) && + (softring->s_ring_rx_arg2 != NULL)) { + mcip->mci_resource_restart(mcip->mci_resource_arg, + softring->s_ring_rx_arg2); + } + } +} + +/* + * Register the given SRS and associated soft rings with the consumer and + * enable the polling interface used by the consumer.(i.e IP) over this + * SRS and associated soft rings. + */ +void +mac_srs_client_poll_enable(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_rx_fifo_t mrf; + mac_soft_ring_t *softring; + + ASSERT(mac_srs->srs_mcip == mcip); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + if (!(mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE)) + return; + + bzero(&mrf, sizeof (mac_rx_fifo_t)); + mrf.mrf_type = MAC_RX_FIFO; + + /* + * A SRS is capable of acting as a soft ring for cases + * where no fanout is needed. This is the case for userland + * flows. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) + return; + + mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll; + mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable; + mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable; + mac_srs->srs_type |= SRST_CLIENT_POLL_ENABLED; + + softring = mac_srs->srs_soft_ring_head; + while (softring != NULL) { + if (softring->s_ring_type & (ST_RING_TCP | ST_RING_UDP)) { + /* + * TCP and UDP support DLS bypass. Squeue polling + * support implies DLS bypass since the squeue poll + * path does not have DLS processing. + */ + mac_soft_ring_dls_bypass(softring, + mcip->mci_direct_rx_fn, mcip->mci_direct_rx_arg); + } + /* + * Non-TCP protocols don't support squeues. Hence we don't + * make any ring addition callbacks for non-TCP rings + */ + if (!(softring->s_ring_type & ST_RING_TCP)) { + softring->s_ring_rx_arg2 = NULL; + softring = softring->s_ring_next; + continue; + } + mrf.mrf_rx_arg = softring; + mrf.mrf_intr_handle = (mac_intr_handle_t)softring; + mrf.mrf_cpu_id = softring->s_ring_cpuid; + mrf.mrf_flow_priority = mac_srs->srs_pri; + + softring->s_ring_rx_arg2 = mcip->mci_resource_add( + mcip->mci_resource_arg, (mac_resource_t *)&mrf); + + softring = softring->s_ring_next; + } +} + +/* + * Unregister the given SRS and associated soft rings with the consumer and + * disable the polling interface used by the consumer.(i.e IP) over this + * SRS and associated soft rings. 
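+ * This undoes mac_srs_client_poll_enable(): DLS bypass is turned off
+ * for both the TCP and UDP soft rings, and the consumer's ring-removal
+ * callback is invoked for the TCP rings that were registered.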
+ */ +void +mac_srs_client_poll_disable(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *softring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + /* + * A SRS is capable of acting as a soft ring for cases + * where no protocol fanout is needed. This is the case + * for userland flows. Nothing to do here. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) + return; + + mutex_enter(&mac_srs->srs_lock); + if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) { + ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS)); + mutex_exit(&mac_srs->srs_lock); + return; + } + mac_srs->srs_type &= ~(SRST_CLIENT_POLL_ENABLED | SRST_DLS_BYPASS); + mutex_exit(&mac_srs->srs_lock); + + /* + * DLS bypass is now disabled in the case of both TCP and UDP. + * Reset the soft ring callbacks to the standard 'mac_rx_deliver' + * callback. In addition, in the case of TCP, invoke IP's callback + * for ring removal. + */ + for (softring = mac_srs->srs_soft_ring_head; + softring != NULL; softring = softring->s_ring_next) { + if (!(softring->s_ring_type & (ST_RING_UDP | ST_RING_TCP))) + continue; + + if ((softring->s_ring_type & ST_RING_TCP) && + softring->s_ring_rx_arg2 != NULL) { + mcip->mci_resource_remove(mcip->mci_resource_arg, + softring->s_ring_rx_arg2); + } + + mutex_enter(&softring->s_ring_lock); + while (softring->s_ring_state & S_RING_PROC) { + softring->s_ring_state |= S_RING_CLIENT_WAIT; + cv_wait(&softring->s_ring_client_cv, + &softring->s_ring_lock); + } + softring->s_ring_state &= ~S_RING_CLIENT_WAIT; + softring->s_ring_rx_arg2 = NULL; + softring->s_ring_rx_func = mac_rx_deliver; + softring->s_ring_rx_arg1 = mcip; + mutex_exit(&softring->s_ring_lock); + } +} + +/* + * Enable or disable poll capability of the SRS on the underlying Rx ring. + * + * There is a need to enable or disable the poll capability of an SRS over an + * Rx ring depending on the number of mac clients sharing the ring and also + * whether user flows are configured on it. However the poll state is actively + * manipulated by the SRS worker and poll threads and uncoordinated changes by + * yet another thread to the underlying capability can surprise them leading + * to assert failures. Instead we quiesce the SRS, make the changes and then + * restart the SRS. + */ +static void +mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs, + boolean_t turn_off_poll_capab, mac_rx_func_t rx_func) +{ + boolean_t need_restart = B_FALSE; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + mac_ring_t *ring; + + if (!SRS_QUIESCED(mac_srs)) { + mac_rx_srs_quiesce(mac_srs, SRS_QUIESCE); + need_restart = B_TRUE; + } + + ring = mac_srs->srs_ring; + if ((ring != NULL) && + (ring->mr_classify_type == MAC_HW_CLASSIFIER)) { + if (turn_off_poll_capab) + mac_srs->srs_state &= ~SRS_POLLING_CAPAB; + else + mac_srs->srs_state |= SRS_POLLING_CAPAB; + } + srs_rx->sr_lower_proc = rx_func; + + if (need_restart) + mac_rx_srs_restart(mac_srs); +} + +/* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */ + +/* + * Return the next CPU to be used to bind a MAC kernel thread. 
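+ * Expects cpu_lock to be held; candidates are tried in increasing
+ * CPU id order, falling back to srs_bind_cpu when the candidate is
+ * absent or offline.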
+ */ +static processorid_t +mac_next_bind_cpu(void) +{ + static processorid_t srs_curr_cpu = -1; + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + srs_curr_cpu++; + cp = cpu_get(srs_curr_cpu); + if (cp == NULL || !cpu_is_online(cp)) + srs_curr_cpu = srs_bind_cpu; + + return (srs_curr_cpu); +} + +/* ARGSUSED */ +static int +mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_CPUPART_IN: + mac_walk_srs_and_bind(id); + break; + + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + mac_walk_srs_and_unbind(id); + break; + + default: + break; + } + return (0); +} + +/* + * mac_compute_soft_ring_count(): + * + * This routine computes the number of soft rings needed to handle incoming + * load given a flow_entry. + * + * The routine does the following: + * 1) soft rings will be created if mac_soft_ring_enable is set. + * 2) If the underlying link is a 10Gbps link, then soft rings will be + * created even if mac_soft_ring_enable is not set. The number of soft + * rings, so created, will equal mac_rx_soft_ring_10gig_count. + * 3) On a sun4v platform (i.e., mac_soft_ring_enable is set), 2 times the + * mac_rx_soft_ring_10gig_count number of soft rings will be created for a + * 10Gbps link. + * + * If a bandwidth limit is specified, the number that gets computed is + * dependent upon CPU speed, the number of Rx rings configured, and + * the bandwidth limit. + * If more Rx rings are available, less number of soft rings is needed. + * + * mac_use_bw_heuristic is another "hidden" variable that can be used to + * override the default use of soft ring count computation. Depending upon + * the usefulness of it, mac_use_bw_heuristic can later be made into a + * data-link property or removed altogether. + * + * TODO: Cleanup and tighten some of the assumptions. + */ +boolean_t mac_use_bw_heuristic = B_TRUE; +static int +mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt) +{ + uint64_t cpu_speed, bw = 0; + int srings = 0; + boolean_t bw_enabled = B_FALSE; + + ASSERT(!(flent->fe_type & FLOW_USER)); + if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && + mac_use_bw_heuristic) { + /* bandwidth enabled */ + bw_enabled = B_TRUE; + bw = flent->fe_resource_props.mrp_maxbw; + } + if (!bw_enabled) { + /* No bandwidth enabled */ + if (mac_soft_ring_enable) + srings = mac_rx_soft_ring_count; + + /* Is this a 10Gig link? */ + flent->fe_nic_speed = mac_client_stat_get(flent->fe_mcip, + MAC_STAT_IFSPEED); + /* convert to Mbps */ + if (((flent->fe_nic_speed)/1000000) > 1000 && + mac_rx_soft_ring_10gig_count > 0) { + /* This is a 10Gig link */ + srings = mac_rx_soft_ring_10gig_count; + /* + * Use 2 times mac_rx_soft_ring_10gig_count for + * sun4v systems. + */ + if (mac_soft_ring_enable) + srings = srings * 2; + } + } else { + /* + * Soft ring computation using CPU speed and specified + * bandwidth limit. + */ + /* Assumption: all CPUs have the same frequency */ + cpu_speed = (uint64_t)CPU->cpu_type_info.pi_clock; + + /* cpu_speed is in MHz; make bw in units of Mbps. */ + bw = bw/1000000; + + if (bw >= 1000) { + /* + * bw is greater than or equal to 1Gbps. + * The number of soft rings required is a function + * of bandwidth and CPU speed. To keep this simple, + * let's use this rule: 1GHz CPU can handle 1Gbps. + * If bw is less than 1 Gbps, then there is no need + * for soft rings. Assumption is that CPU speeds + * (on modern systems) are at least 1GHz. 
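+ *
+ * As a worked example with illustrative numbers, a 10000 Mbps
+ * (10 Gbps) limit on 2500 MHz CPUs yields 10000/2500 = 4 soft rings
+ * from the integer division below.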
+ */
+ srings = bw/cpu_speed;
+ if (srings <= 1 && mac_soft_ring_enable) {
+ /*
+ * Give at least 2 soft rings
+ * for sun4v systems
+ */
+ srings = 2;
+ }
+ }
+ }
+ /*
+ * If the flent has multiple Rx SRSs, then each SRS need not
+ * have that many soft rings on top of it. The number of
+ * soft rings for each Rx SRS is found by dividing srings by
+ * rx_srs_cnt.
+ */
+ if (rx_srs_cnt > 1) {
+ int remainder;
+
+ remainder = srings%rx_srs_cnt;
+ srings = srings/rx_srs_cnt;
+ if (remainder != 0)
+ srings++;
+ /*
+ * Fanning out to 1 soft ring is not very useful.
+ * Set it to 0 as well, and mac_srs_fanout_init()
+ * will take care of creating a single soft ring
+ * for proto fanout.
+ */
+ if (srings == 1)
+ srings = 0;
+ }
+ /* Do some more massaging */
+ srings = min(srings, ncpus);
+ srings = min(srings, MAX_SR_FANOUT);
+ return (srings);
+}
+
+/*
+ * Assignment of user specified CPUs to a link.
+ *
+ * Minimum CPUs required to get an optimal assignment:
+ * For each Rx SRS, at least two CPUs are needed if the
+ * mac_latency_optimize flag is set -- one for polling, one for the
+ * fanout soft ring.
+ * If mac_latency_optimize is not set, then 3 CPUs are needed -- one
+ * for polling, one for the SRS worker thread and one for the fanout
+ * soft ring.
+ *
+ * The number of CPUs needed for the Tx side equals the number of Tx
+ * rings the link is using.
+ *
+ * mac_flow_user_cpu_init() categorizes the CPU assignment depending
+ * upon the number of CPUs in 3 different buckets.
+ *
+ * In the first bucket, the most optimal case is handled. The user has
+ * passed in enough CPUs and every thread gets its own CPU.
+ *
+ * The second and third are the sub-optimal cases. Not enough CPUs are
+ * available.
+ *
+ * The second bucket handles the case where at least one distinct CPU
+ * is available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
+ * SRS or soft rings).
+ *
+ * In the third case (worst case scenario), the specified CPU count is
+ * less than the number of Rx rings configured for the link. In this
+ * case, we round robin the CPUs among the Rx SRSes and Tx SRS/soft
+ * rings.
+ */
+static void
+mac_flow_user_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ mac_soft_ring_set_t *rx_srs, *tx_srs;
+ int i, srs_cnt;
+ mac_cpus_t *srs_cpu;
+ int no_of_cpus, cpu_cnt;
+ int rx_srs_cnt, reqd_rx_cpu_cnt;
+ int fanout_cpu_cnt, reqd_tx_cpu_cnt;
+ int reqd_poll_worker_cnt, fanout_cnt_per_srs;
+
+ ASSERT(mrp->mrp_fanout_mode == MCM_CPUS);
+ /*
+ * The check that mrp_ncpus is within limits for
+ * the user specified case was done earlier and if
+ * not within limits, an error would have been
+ * returned to the user.
+ */
+ ASSERT(mrp->mrp_ncpus > 0 && mrp->mrp_ncpus <= MAX_SR_FANOUT);
+
+ no_of_cpus = mrp->mrp_ncpus;
+
+ if (mrp->mrp_intr_cpu != -1) {
+ /*
+ * The interrupt has been re-targeted. The poll
+ * thread needs to be bound to the interrupt
+ * CPU. Presently only fixed interrupts
+ * are re-targeted; MSI-X interrupts aren't.
+ *
+ * Find where the intr CPU is in the list
+ * and swap it with the first one.
+ * We will be using the first CPU in the
+ * list for poll.
+ */
+ for (i = 0; i < no_of_cpus; i++) {
+ if (mrp->mrp_cpu[i] == mrp->mrp_intr_cpu)
+ break;
+ }
+ mrp->mrp_cpu[i] = mrp->mrp_cpu[0];
+ mrp->mrp_cpu[0] = mrp->mrp_intr_cpu;
+ }
+
+ /*
+ * Requirements:
+ * The number of CPUs that each Rx ring needs is dependent
+ * upon the mac_latency_optimize flag.
+ * 1) If set, at least 2 CPUs are needed -- one for
+ * polling, one for fanout soft ring.
+ * 2) If not set, then at least 3 CPUs are needed -- one
+ * for polling, one for the srs worker thread, and one for
+ * the fanout soft ring.
+ */
+ rx_srs_cnt = (flent->fe_rx_srs_cnt > 1) ?
+ (flent->fe_rx_srs_cnt - 1) : flent->fe_rx_srs_cnt;
+ reqd_rx_cpu_cnt = mac_latency_optimize ?
+ (rx_srs_cnt * 2) : (rx_srs_cnt * 3);
+
+ /* How many CPUs are needed for Tx side? */
+ tx_srs = flent->fe_tx_srs;
+ reqd_tx_cpu_cnt = TX_MULTI_RING_MODE(tx_srs) ?
+ tx_srs->srs_oth_ring_count : 1;
+
+ /* CPUs needed for Rx SRSes poll and worker threads */
+ reqd_poll_worker_cnt = mac_latency_optimize ?
+ rx_srs_cnt : rx_srs_cnt * 2;
+
+ /* Has the user provided enough CPUs? */
+ if (no_of_cpus >= (reqd_rx_cpu_cnt + reqd_tx_cpu_cnt)) {
+ /*
+ * Best case scenario. There are enough CPUs. All
+ * Rx rings will get their own set of CPUs plus
+ * Tx soft rings will get their own.
+ */
+ /*
+ * fanout_cpu_cnt is the number of CPUs available
+ * for Rx side fanout soft rings.
+ */
+ fanout_cpu_cnt = no_of_cpus -
+ reqd_poll_worker_cnt - reqd_tx_cpu_cnt;
+
+ /*
+ * Divide fanout_cpu_cnt by rx_srs_cnt to find
+ * out how many fanout soft rings each Rx SRS
+ * can have.
+ */
+ fanout_cnt_per_srs = fanout_cpu_cnt/rx_srs_cnt;
+
+ /* Do the assignment for the default Rx ring */
+ cpu_cnt = 0;
+ rx_srs = flent->fe_rx_srs[0];
+ ASSERT(rx_srs->srs_ring == NULL);
+ if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
+ rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu,
+ srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize)
+ srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++];
+ for (i = 0; i < fanout_cnt_per_srs; i++)
+ srs_cpu->mc_fanout_cpus[i] = mrp->mrp_cpu[cpu_cnt++];
+
+ /* Do the assignment for h/w Rx SRSes */
+ if (flent->fe_rx_srs_cnt > 1) {
+ cpu_cnt = 0;
+ for (srs_cnt = 1;
+ srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
+ rx_srs = flent->fe_rx_srs[srs_cnt];
+ ASSERT(rx_srs->srs_ring != NULL);
+ if (rx_srs->srs_fanout_state ==
+ SRS_FANOUT_INIT) {
+ rx_srs->srs_fanout_state =
+ SRS_FANOUT_REINIT;
+ }
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
+ sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
+ /* The first CPU in the list is the intr CPU */
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize) {
+ srs_cpu->mc_workerid =
+ mrp->mrp_cpu[cpu_cnt++];
+ }
+ for (i = 0; i < fanout_cnt_per_srs; i++) {
+ srs_cpu->mc_fanout_cpus[i] =
+ mrp->mrp_cpu[cpu_cnt++];
+ }
+ ASSERT(cpu_cnt <= no_of_cpus);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Sub-optimal case.
+ * We have the following information:
+ * no_of_cpus - no. of cpus that user passed.
+ * rx_srs_cnt - no. of rx rings.
+ * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
+ * reqd_tx_cpu_cnt - no. of cpus reqd. for Tx side.
+ * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
+ */
+ /*
+ * If we bind the Rx fanout soft rings to the same CPUs
+ * as poll/worker, would that be enough?
+ */
+ if (no_of_cpus >= (rx_srs_cnt + reqd_tx_cpu_cnt)) {
+ boolean_t worker_assign = B_FALSE;
+
+ /*
+ * If mac_latency_optimize is not set, are there
+ * enough CPUs to assign a CPU for the worker also?
+ */ + if (no_of_cpus >= (reqd_poll_worker_cnt + reqd_tx_cpu_cnt)) + worker_assign = B_TRUE; + /* + * Zero'th Rx SRS is the default Rx ring. It is not + * associated with h/w Rx ring. + */ + rx_srs = flent->fe_rx_srs[0]; + ASSERT(rx_srs->srs_ring == NULL); + if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) + rx_srs->srs_fanout_state = SRS_FANOUT_REINIT; + cpu_cnt = 0; + srs_cpu = &rx_srs->srs_cpu; + srs_cpu->mc_ncpus = no_of_cpus; + bcopy(mrp->mrp_cpu, + srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus)); + srs_cpu->mc_fanout_cnt = 1; + srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++]; + srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu; + srs_cpu->mc_workerid = srs_cpu->mc_pollid; + if (!mac_latency_optimize && worker_assign) + srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++]; + srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt]; + + /* Do CPU bindings for SRSes having h/w Rx rings */ + if (flent->fe_rx_srs_cnt > 1) { + cpu_cnt = 0; + for (srs_cnt = 1; + srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) { + rx_srs = flent->fe_rx_srs[srs_cnt]; + ASSERT(rx_srs->srs_ring != NULL); + if (rx_srs->srs_fanout_state == + SRS_FANOUT_INIT) { + rx_srs->srs_fanout_state = + SRS_FANOUT_REINIT; + } + srs_cpu = &rx_srs->srs_cpu; + srs_cpu->mc_ncpus = no_of_cpus; + bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus, + sizeof (srs_cpu->mc_cpus)); + srs_cpu->mc_pollid = + mrp->mrp_cpu[cpu_cnt]; + srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu; + srs_cpu->mc_workerid = srs_cpu->mc_pollid; + if (!mac_latency_optimize && worker_assign) { + srs_cpu->mc_workerid = + mrp->mrp_cpu[++cpu_cnt]; + } + srs_cpu->mc_fanout_cnt = 1; + srs_cpu->mc_fanout_cpus[0] = + mrp->mrp_cpu[cpu_cnt]; + cpu_cnt++; + ASSERT(cpu_cnt <= no_of_cpus); + } + } + return; + } + + /* + * Real sub-optimal case. Not enough CPUs for poll and + * Tx soft rings. Do a round robin assignment where + * each Rx SRS will get the same CPU for poll, worker + * and fanout soft ring. + */ + cpu_cnt = 0; + for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) { + rx_srs = flent->fe_rx_srs[srs_cnt]; + srs_cpu = &rx_srs->srs_cpu; + if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) + rx_srs->srs_fanout_state = SRS_FANOUT_REINIT; + srs_cpu->mc_ncpus = no_of_cpus; + bcopy(mrp->mrp_cpu, + srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus)); + srs_cpu->mc_fanout_cnt = 1; + srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt]; + srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu; + srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt]; + srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt]; + if (++cpu_cnt >= no_of_cpus) + cpu_cnt = 0; + } +} + +/* + * mac_flow_cpu_init(): + * + * Each SRS has a mac_cpu_t structure, srs_cpu. This routine fills in + * the CPU binding information in srs_cpu for all Rx SRSes associated + * with a flent. + */ +static void +mac_flow_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp) +{ + mac_soft_ring_set_t *rx_srs; + processorid_t cpuid; + int j, srs_cnt, soft_ring_cnt = 0; + mac_cpus_t *srs_cpu; + + if (mrp->mrp_mask & MRP_CPUS_USERSPEC) { + mac_flow_user_cpu_init(flent, mrp); + } else { + /* + * Compute the number of soft rings needed on top for each Rx + * SRS. "rx_srs_cnt-1" indicates the number of Rx SRS + * associated with h/w Rx rings. Soft ring count needed for + * each h/w Rx SRS is computed and the same is applied to + * software classified Rx SRS. The first Rx SRS in fe_rx_srs[] + * is the software classified Rx SRS. 
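+ *
+ * For example, with fe_rx_srs_cnt == 3 the soft ring count is
+ * computed over the two h/w Rx SRSes, and the same fanout count
+ * is then applied to all three SRSes, including the software
+ * classified fe_rx_srs[0].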
+ */ + soft_ring_cnt = mac_compute_soft_ring_count(flent, + flent->fe_rx_srs_cnt - 1); + if (soft_ring_cnt == 0) { + /* + * Even when soft_ring_cnt is 0, we still need + * to create a soft ring for TCP, UDP and + * OTHER. So set it to 1. + */ + soft_ring_cnt = 1; + } + for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) { + rx_srs = flent->fe_rx_srs[srs_cnt]; + srs_cpu = &rx_srs->srs_cpu; + if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) { + if (soft_ring_cnt == srs_cpu->mc_fanout_cnt) + continue; + rx_srs->srs_fanout_state = SRS_FANOUT_REINIT; + } + srs_cpu->mc_ncpus = soft_ring_cnt; + srs_cpu->mc_fanout_cnt = soft_ring_cnt; + mutex_enter(&cpu_lock); + for (j = 0; j < soft_ring_cnt; j++) { + cpuid = mac_next_bind_cpu(); + srs_cpu->mc_cpus[j] = cpuid; + srs_cpu->mc_fanout_cpus[j] = cpuid; + } + cpuid = mac_next_bind_cpu(); + srs_cpu->mc_pollid = cpuid; + /* increment ncpus to account for polling cpu */ + srs_cpu->mc_ncpus++; + srs_cpu->mc_cpus[j++] = cpuid; + if (!mac_latency_optimize) { + cpuid = mac_next_bind_cpu(); + srs_cpu->mc_ncpus++; + srs_cpu->mc_cpus[j++] = cpuid; + } + srs_cpu->mc_workerid = cpuid; + mutex_exit(&cpu_lock); + } + } +} + +/* + * DATAPATH SETUP ROUTINES + * (setup SRS and set/update FANOUT, B/W and PRIORITY) + */ + +static void +mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs) +{ + mac_srs->srs_tcp_soft_rings = (mac_soft_ring_t **) + kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP); + mac_srs->srs_udp_soft_rings = (mac_soft_ring_t **) + kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP); + mac_srs->srs_oth_soft_rings = (mac_soft_ring_t **) + kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP); +} + +static void +mac_srs_worker_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid) +{ + cpu_t *cp; + boolean_t clear = B_FALSE; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!mac_srs_thread_bind) + return; + + cp = cpu_get(cpuid); + if (cp == NULL || !cpu_is_online(cp)) + return; + + mutex_enter(&mac_srs->srs_lock); + mac_srs->srs_state |= SRS_WORKER_BOUND; + if (mac_srs->srs_worker_cpuid != -1) + clear = B_TRUE; + mac_srs->srs_worker_cpuid = cpuid; + mutex_exit(&mac_srs->srs_lock); + + if (clear) + thread_affinity_clear(mac_srs->srs_worker); + + thread_affinity_set(mac_srs->srs_worker, cpuid); + DTRACE_PROBE1(worker__CPU, processorid_t, cpuid); +} + +static void +mac_srs_poll_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid) +{ + cpu_t *cp; + boolean_t clear = B_FALSE; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!mac_srs_thread_bind || mac_srs->srs_poll_thr == NULL) + return; + + cp = cpu_get(cpuid); + if (cp == NULL || !cpu_is_online(cp)) + return; + + mutex_enter(&mac_srs->srs_lock); + mac_srs->srs_state |= SRS_POLL_BOUND; + if (mac_srs->srs_poll_cpuid != -1) + clear = B_TRUE; + mac_srs->srs_poll_cpuid = cpuid; + mutex_exit(&mac_srs->srs_lock); + + if (clear) + thread_affinity_clear(mac_srs->srs_poll_thr); + + thread_affinity_set(mac_srs->srs_poll_thr, cpuid); + DTRACE_PROBE1(poll__CPU, processorid_t, cpuid); +} + +/* + * When a CPU comes back online, bind the MAC kernel threads which + * were previously bound to that CPU, and had to be unbound because + * the CPU was going away. + * + * These functions are called with cpu_lock held and hence we can't + * cv_wait to grab the mac perimeter. Since these functions walk the soft + * ring list of an SRS without being in the perimeter, the list itself + * is protected by the SRS lock. 
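+ *
+ * mac_srs_worker_bind() and mac_srs_poll_bind() above follow the
+ * same rules: they assert that cpu_lock is held, check the target
+ * CPU with cpu_get()/cpu_is_online(), and take srs_lock before
+ * recording the new binding.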
+ */ +static void +mac_walk_srs_and_bind(int cpuid) +{ + mac_soft_ring_set_t *mac_srs; + mac_soft_ring_t *soft_ring; + + rw_enter(&mac_srs_g_lock, RW_READER); + + if ((mac_srs = mac_srs_g_list) == NULL) + goto done; + + for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) { + if (mac_srs->srs_worker_cpuid == -1 && + mac_srs->srs_worker_cpuid_save == cpuid) { + mac_srs->srs_worker_cpuid_save = -1; + mac_srs_worker_bind(mac_srs, cpuid); + } + + if (!(mac_srs->srs_type & SRST_TX)) { + if (mac_srs->srs_poll_cpuid == -1 && + mac_srs->srs_poll_cpuid_save == cpuid) { + mac_srs->srs_poll_cpuid_save = -1; + mac_srs_poll_bind(mac_srs, cpuid); + } + } + + /* Next tackle the soft rings associated with the srs */ + mutex_enter(&mac_srs->srs_lock); + for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; + soft_ring = soft_ring->s_ring_next) { + if (soft_ring->s_ring_cpuid == -1 && + soft_ring->s_ring_cpuid_save == cpuid) { + soft_ring->s_ring_cpuid_save = -1; + (void) mac_soft_ring_bind(soft_ring, cpuid); + } + } + mutex_exit(&mac_srs->srs_lock); + } +done: + rw_exit(&mac_srs_g_lock); +} + +/* + * Change the priority of the SRS's poll and worker thread. Additionally, + * update the priority of the worker threads for the SRS's soft rings. + * Need to modify any associated squeue threads. + */ +void +mac_update_srs_priority(mac_soft_ring_set_t *mac_srs, pri_t prival) +{ + mac_soft_ring_t *ringp; + + mac_srs->srs_pri = prival; + thread_lock(mac_srs->srs_worker); + (void) thread_change_pri(mac_srs->srs_worker, mac_srs->srs_pri, 0); + thread_unlock(mac_srs->srs_worker); + if (mac_srs->srs_poll_thr != NULL) { + thread_lock(mac_srs->srs_poll_thr); + (void) thread_change_pri(mac_srs->srs_poll_thr, + mac_srs->srs_pri, 0); + thread_unlock(mac_srs->srs_poll_thr); + } + if ((ringp = mac_srs->srs_soft_ring_head) == NULL) + return; + while (ringp != mac_srs->srs_soft_ring_tail) { + thread_lock(ringp->s_ring_worker); + (void) thread_change_pri(ringp->s_ring_worker, + mac_srs->srs_pri, 0); + thread_unlock(ringp->s_ring_worker); + ringp = ringp->s_ring_next; + } + ASSERT(ringp == mac_srs->srs_soft_ring_tail); + thread_lock(ringp->s_ring_worker); + (void) thread_change_pri(ringp->s_ring_worker, mac_srs->srs_pri, 0); + thread_unlock(ringp->s_ring_worker); +} + +/* + * Change the receive bandwidth limit. + */ +static void +mac_rx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp) +{ + mac_soft_ring_t *softring; + + mutex_enter(&srs->srs_lock); + mutex_enter(&srs->srs_bw->mac_bw_lock); + + if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { + /* Reset bandwidth limit */ + if (srs->srs_type & SRST_BW_CONTROL) { + softring = srs->srs_soft_ring_head; + while (softring != NULL) { + softring->s_ring_type &= ~ST_RING_BW_CTL; + softring = softring->s_ring_next; + } + srs->srs_type &= ~SRST_BW_CONTROL; + srs->srs_drain_func = mac_rx_srs_drain; + } + } else { + /* Set/Modify bandwidth limit */ + srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw); + /* + * Give twice the queuing capability before + * dropping packets. The unit is bytes/tick. 
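+ *
+ * As a worked example (assuming mrp_maxbw is carried in bits per
+ * second and the clock runs at hz ticks per second): a 100 Mb/s
+ * limit works out to 100000000 / 8 / hz bytes per tick -- 125000
+ * bytes at hz == 100 -- with the drop threshold at twice that,
+ * 250000 bytes.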
+ */
+ srs->srs_bw->mac_bw_drop_threshold =
+ srs->srs_bw->mac_bw_limit << 1;
+ if (!(srs->srs_type & SRST_BW_CONTROL)) {
+ softring = srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ softring->s_ring_type |= ST_RING_BW_CTL;
+ softring = softring->s_ring_next;
+ }
+ srs->srs_type |= SRST_BW_CONTROL;
+ srs->srs_drain_func = mac_rx_srs_drain_bw;
+ }
+ }
+done:
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+ mutex_exit(&srs->srs_lock);
+}
+
+/* Change the transmit bandwidth limit */
+static void
+mac_tx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
+{
+ mac_srs_tx_t *srs_tx = &srs->srs_tx;
+ uint32_t tx_mode;
+ mac_impl_t *mip = srs->srs_mcip->mci_mip;
+
+ mutex_enter(&srs->srs_lock);
+ mutex_enter(&srs->srs_bw->mac_bw_lock);
+
+ tx_mode = srs_tx->st_mode;
+
+ if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
+ /* Reset bandwidth limit */
+ if (tx_mode == SRS_TX_BW) {
+ if (mac_tx_serialize ||
+ (mip->mi_v12n_level & MAC_VIRT_SERIALIZE)) {
+ srs_tx->st_mode = SRS_TX_SERIALIZE;
+ } else {
+ srs_tx->st_mode = SRS_TX_DEFAULT;
+ }
+ } else if (tx_mode == SRS_TX_BW_FANOUT) {
+ srs_tx->st_mode = SRS_TX_FANOUT;
+ }
+ srs->srs_type &= ~SRST_BW_CONTROL;
+ } else {
+ /* Set/Modify bandwidth limit */
+ srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ srs->srs_bw->mac_bw_drop_threshold =
+ srs->srs_bw->mac_bw_limit << 1;
+ srs->srs_type |= SRST_BW_CONTROL;
+ if (tx_mode != SRS_TX_BW &&
+ tx_mode != SRS_TX_BW_FANOUT) {
+ if (tx_mode == SRS_TX_SERIALIZE ||
+ tx_mode == SRS_TX_DEFAULT) {
+ srs_tx->st_mode = SRS_TX_BW;
+ } else if (tx_mode == SRS_TX_FANOUT) {
+ srs_tx->st_mode = SRS_TX_BW_FANOUT;
+ } else {
+ ASSERT(0);
+ }
+ }
+ }
+done:
+ srs_tx->st_func = mac_tx_get_func(srs_tx->st_mode);
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+ mutex_exit(&srs->srs_lock);
+}
+
+/*
+ * The uber function that deals with any update to bandwidth limits.
+ */
+void
+mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ int count;
+
+ for (count = 0; count < flent->fe_rx_srs_cnt; count++)
+ mac_rx_srs_update_bwlimit(flent->fe_rx_srs[count], mrp);
+ mac_tx_srs_update_bwlimit(flent->fe_tx_srs, mrp);
+}
+
+void
+mac_srs_change_upcall(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
+{
+ mac_soft_ring_set_t *mac_srs = arg;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+ mac_soft_ring_t *softring;
+
+ mutex_enter(&mac_srs->srs_lock);
+ ASSERT((mac_srs->srs_type & SRST_TX) == 0);
+ srs_rx->sr_func = rx_func;
+ srs_rx->sr_arg1 = rx_arg1;
+
+ softring = mac_srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ mutex_enter(&softring->s_ring_lock);
+ softring->s_ring_rx_func = rx_func;
+ softring->s_ring_rx_arg1 = rx_arg1;
+ mutex_exit(&softring->s_ring_lock);
+ softring = softring->s_ring_next;
+ }
+
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * When the first sub-flow is added to a link, we disable polling on the
+ * link and also modify the entry point to mac_rx_srs_subflow_process.
+ * (Polling is disabled because, with the subflow added, accounting
+ * for polling needs additional logic; it is assumed that when a subflow
+ * is added, we can take some hit as a result of disabling polling rather
+ * than adding more complexity - if this becomes a perf. issue we need to
+ * re-evaluate this logic.) When the last subflow is removed, we turn
+ * polling back on and also reset the entry point to mac_rx_srs_process.
+ *
+ * In the future, if there are multiple SRSes, we can simply
+ * take one and give it to the flow rather than disabling polling and
+ * resetting the entry point.
+ */
+void
+mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable)
+{
+ flow_entry_t *flent = mcip->mci_flent;
+ int i;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_rx_func_t rx_func;
+ uint_t rx_srs_cnt;
+ boolean_t enable_classifier;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ enable_classifier = !FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && enable;
+
+ rx_func = enable_classifier ? mac_rx_srs_subflow_process :
+ mac_rx_srs_process;
+
+ /*
+ * If the receive function has already been configured correctly for
+ * the current subflow configuration, do nothing.
+ */
+ if (flent->fe_cb_fn == (flow_fn_t)rx_func)
+ return;
+
+ rx_srs_cnt = flent->fe_rx_srs_cnt;
+ for (i = 0; i < rx_srs_cnt; i++) {
+ ASSERT(flent->fe_rx_srs[i] != NULL);
+ mac_srs_poll_state_change(flent->fe_rx_srs[i],
+ enable_classifier, rx_func);
+ }
+
+ /*
+ * Change the S/W classifier so that we can land in the
+ * correct processing function with the correct argument.
+ * If all subflows have been removed we can revert to
+ * mac_rx_srs_process, else we need mac_rx_srs_subflow_process.
+ */
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)rx_func;
+ flent->fe_cb_arg1 = (void *)mip;
+ flent->fe_cb_arg2 = flent->fe_rx_srs[0];
+ mutex_exit(&flent->fe_lock);
+}
+
+static void
+mac_srs_update_fanout_list(mac_soft_ring_set_t *mac_srs)
+{
+ int tcp_count = 0;
+ int udp_count = 0;
+ int oth_count = 0;
+ mac_soft_ring_t *softring;
+
+ softring = mac_srs->srs_soft_ring_head;
+ if (softring == NULL) {
+ ASSERT(mac_srs->srs_soft_ring_count == 0);
+ mac_srs->srs_tcp_ring_count = 0;
+ mac_srs->srs_udp_ring_count = 0;
+ mac_srs->srs_oth_ring_count = 0;
+ return;
+ }
+
+ softring = mac_srs->srs_soft_ring_head;
+ tcp_count = udp_count = oth_count = 0;
+
+ while (softring != NULL) {
+ if (softring->s_ring_type & ST_RING_TCP)
+ mac_srs->srs_tcp_soft_rings[tcp_count++] = softring;
+ else if (softring->s_ring_type & ST_RING_UDP)
+ mac_srs->srs_udp_soft_rings[udp_count++] = softring;
+ else
+ mac_srs->srs_oth_soft_rings[oth_count++] = softring;
+ softring = softring->s_ring_next;
+ }
+
+ ASSERT(mac_srs->srs_soft_ring_count ==
+ (tcp_count + udp_count + oth_count));
+
+ mac_srs->srs_tcp_ring_count = tcp_count;
+ mac_srs->srs_udp_ring_count = udp_count;
+ mac_srs->srs_oth_ring_count = oth_count;
+}
+
+void
+mac_srs_create_proto_softrings(int id, void *flent, uint16_t type,
+ pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
+ processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, boolean_t set_bypass)
+{
+ mac_soft_ring_t *softring;
+ mac_rx_fifo_t mrf;
+
+ bzero(&mrf, sizeof (mac_rx_fifo_t));
+ mrf.mrf_type = MAC_RX_FIFO;
+ mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
+ mrf.mrf_intr_enable =
+ (mac_intr_enable_t)mac_soft_ring_intr_enable;
+ mrf.mrf_intr_disable =
+ (mac_intr_disable_t)mac_soft_ring_intr_disable;
+ mrf.mrf_flow_priority = pri;
+
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_TCP), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+
+ /*
+ * TCP and UDP support DLS bypass. In addition, TCP
+ * squeues can also poll their corresponding soft rings.
+ */
+ if (set_bypass && (mcip->mci_resource_arg != NULL)) {
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn,
+ mcip->mci_direct_rx_arg);
+
+ mrf.mrf_rx_arg = softring;
+ mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
+
+ /*
+ * Make a call in IP to get a TCP squeue assigned to
+ * this softring to maintain full CPU locality through
+ * the stack and allow the squeue to be able to poll
+ * the softring so the flow control can be pushed
+ * all the way to H/W.
+ */
+ softring->s_ring_rx_arg2 =
+ mcip->mci_resource_add((void *)mcip->mci_resource_arg,
+ (mac_resource_t *)&mrf);
+ }
+
+ /*
+ * Non-TCP protocols don't support squeues. Hence we
+ * don't make any ring addition callbacks for non-TCP
+ * rings. Now create the UDP softring and allow it to
+ * bypass the DLS layer.
+ */
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_UDP), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+
+ if (set_bypass && (mcip->mci_resource_arg != NULL)) {
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn,
+ mcip->mci_direct_rx_arg);
+ }
+
+ /* Create the OTH softring, which has to go through DLS */
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_OTH), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+}
+
+/*
+ * This routine associates a CPU or a set of CPUs with the processing
+ * of incoming traffic from a mac client. If multiple CPUs are
+ * specified, then that many soft rings are created, with each soft
+ * ring worker thread bound to a CPU in the set. Each soft ring in
+ * turn will be associated with an squeue, and the squeue will be
+ * moved to the same CPU as the soft ring's worker thread.
+ */
+static void
+mac_srs_fanout_modify(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs,
+ mac_soft_ring_set_t *mac_tx_srs)
+{
+ mac_soft_ring_t *softring;
+ uint32_t soft_ring_flag = soft_ring_process_flag;
+ processorid_t cpuid = -1;
+ boolean_t user_specified;
+ int i, srings_present, new_fanout_cnt;
+ mac_cpus_t *srs_cpu;
+
+ user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC;
+ /* fanout state is REINIT. Set it back to INIT */
+ ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_REINIT);
+ mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
+
+ /* how many are present right now */
+ srings_present = mac_rx_srs->srs_tcp_ring_count;
+ /* new request */
+ srs_cpu = &mac_rx_srs->srs_cpu;
+ new_fanout_cnt = srs_cpu->mc_fanout_cnt;
+
+ mutex_enter(&mac_rx_srs->srs_lock);
+ if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
+ soft_ring_flag |= ST_RING_BW_CTL;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ if (new_fanout_cnt > srings_present) {
+ /* soft rings increased */
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ for (i = mac_rx_srs->srs_tcp_ring_count;
+ i < new_fanout_cnt; i++) {
+ /*
+ * Create the protocol softrings and set the
+ * DLS bypass where possible.
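+ *
+ * Per mac_srs_create_proto_softrings() above, each fanout
+ * index yields three softrings: a TCP softring (DLS bypass,
+ * optionally polled by a TCP squeue), a UDP softring (DLS
+ * bypass only), and an OTH softring that goes through DLS.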
+ */
+ mac_srs_create_proto_softrings(i,
+ (void *)flent, soft_ring_flag,
+ mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
+ rx_func, x_arg1, x_arg2, B_TRUE);
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ } else if (new_fanout_cnt < srings_present) {
+ /* soft rings decreased */
+ if (new_fanout_cnt == 1) {
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type &= ~SRST_FANOUT_SRC_IP;
+ ASSERT(mac_rx_srs->srs_type & SRST_FANOUT_PROTO);
+ mutex_exit(&mac_rx_srs->srs_lock);
+ }
+ /* Get rid of extra soft rings */
+ for (i = new_fanout_cnt;
+ i < mac_rx_srs->srs_tcp_ring_count; i++) {
+ softring = mac_rx_srs->srs_tcp_soft_rings[i];
+ if (softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_remove(
+ (void *)mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2);
+ }
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_tcp_soft_rings[i]);
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_udp_soft_rings[i]);
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_oth_soft_rings[i]);
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ }
+
+ ASSERT(new_fanout_cnt == mac_rx_srs->srs_tcp_ring_count);
+ mutex_enter(&cpu_lock);
+ for (i = 0; i < mac_rx_srs->srs_tcp_ring_count; i++) {
+ cpuid = srs_cpu->mc_fanout_cpus[i];
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_udp_soft_rings[i],
+ cpuid);
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_oth_soft_rings[i],
+ cpuid);
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_tcp_soft_rings[i],
+ cpuid);
+ softring = mac_rx_srs->srs_tcp_soft_rings[i];
+ if (softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_bind((void *)mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2, cpuid);
+ }
+ }
+
+ mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_workerid);
+ mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_pollid);
+
+ /*
+ * Bind the Tx SRS and soft ring threads too. Bind the Tx
+ * SRS to the last CPU in the mrp list.
+ */
+ if (mac_tx_srs != NULL && user_specified) {
+ BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
+ }
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * Bind SRS threads and soft rings to CPUs/create fanout list.
+ */
+void
+mac_srs_fanout_init(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs,
+ mac_soft_ring_set_t *mac_tx_srs)
+{
+ int i;
+ processorid_t cpuid, worker_cpuid, poll_cpuid;
+ uint32_t soft_ring_flag = soft_ring_process_flag;
+ int soft_ring_cnt;
+ boolean_t user_specified = B_FALSE;
+ mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu;
+
+ /*
+ * Remove the no soft ring flag and we will adjust it
+ * appropriately further down.
+ */
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type &= ~SRST_NO_SOFT_RINGS;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ ASSERT(mac_rx_srs->srs_soft_ring_head == NULL);
+
+ if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
+ soft_ring_flag |= ST_RING_BW_CTL;
+
+ ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_UNINIT);
+ mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
+ user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC;
+ /*
+ * Ring count can be 0 if no fanout is required and no CPUs
+ * were specified.
+ * Leave the SRS worker and poll thread unbound.
+ */
+ ASSERT(mrp != NULL);
+ soft_ring_cnt = srs_cpu->mc_fanout_cnt;
+
+ /* srs_cpu contains the list of CPUs the threads need to bind to */
+ if (soft_ring_cnt > 0) {
+ mutex_enter(&cpu_lock);
+ for (i = 0; i < soft_ring_cnt; i++) {
+ cpuid = srs_cpu->mc_fanout_cpus[i];
+ /* Create the protocol softrings */
+ mac_srs_create_proto_softrings(i, (void *)flent,
+ soft_ring_flag, mac_rx_srs->srs_pri,
+ mcip, mac_rx_srs, cpuid, rx_func,
+ x_arg1, x_arg2, B_FALSE);
+ }
+ worker_cpuid = srs_cpu->mc_workerid;
+ poll_cpuid = srs_cpu->mc_pollid;
+ mac_srs_worker_bind(mac_rx_srs, worker_cpuid);
+ mac_srs_poll_bind(mac_rx_srs, poll_cpuid);
+
+ /*
+ * Bind the Tx SRS and soft ring threads too.
+ * Bind the Tx SRS to the last CPU in the
+ * mrp list.
+ */
+ if (mac_tx_srs == NULL) {
+ mutex_exit(&cpu_lock);
+ goto alldone;
+ }
+
+ if (user_specified) {
+ BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
+ }
+ mutex_exit(&cpu_lock);
+ } else {
+ mutex_enter(&cpu_lock);
+ /*
+ * For a subflow, mrp_workerid and mrp_pollid
+ * are not set.
+ */
+ mac_srs_worker_bind(mac_rx_srs, mrp->mrp_workerid);
+ mac_srs_poll_bind(mac_rx_srs, mrp->mrp_pollid);
+ mutex_exit(&cpu_lock);
+ goto no_softrings;
+ }
+
+alldone:
+ if (soft_ring_cnt > 1)
+ mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
+ mac_srs_update_fanout_list(mac_rx_srs);
+ mac_srs_client_poll_enable(mcip, mac_rx_srs);
+ return;
+
+no_softrings:
+ if (mac_rx_srs->srs_type & SRST_FANOUT_PROTO) {
+ mutex_enter(&cpu_lock);
+ cpuid = mac_next_bind_cpu();
+ /* Create the protocol softrings */
+ mac_srs_create_proto_softrings(0, (void *)flent,
+ soft_ring_flag, mac_rx_srs->srs_pri,
+ mcip, mac_rx_srs, cpuid, rx_func,
+ x_arg1, x_arg2, B_FALSE);
+ mutex_exit(&cpu_lock);
+ } else {
+ /*
+ * This is the no-fanout case, which holds for
+ * subflows.
+ */
+ mac_rx_srs->srs_type |= SRST_NO_SOFT_RINGS;
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ mac_srs_client_poll_enable(mcip, mac_rx_srs);
+}
+
+/*
+ * mac_fanout_setup:
+ *
+ * Calls mac_srs_fanout_init() or mac_srs_fanout_modify() depending upon
+ * whether the SRS is getting initialized or re-initialized.
+ */
+void
+mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2)
+{
+ mac_soft_ring_set_t *mac_rx_srs, *mac_tx_srs;
+ int i, rx_srs_cnt;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * This is an aggregation port. Fanout will be set up
+ * over the aggregation itself.
+ */
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT)
+ return;
+
+ mac_rx_srs = flent->fe_rx_srs[0];
+ /*
+ * Set up the fanout on the tx side only once, with the
+ * first rx SRS. The CPU binding, fanout, and bandwidth
+ * criteria are common to both RX and TX, so
+ * initializing them alongside avoids redundant code.
+ */
+ mac_tx_srs = flent->fe_tx_srs;
+ rx_srs_cnt = flent->fe_rx_srs_cnt;
+
+ /* No fanout for subflows */
+ if (flent->fe_type & FLOW_USER) {
+ mac_srs_fanout_init(mcip, flent, mrp, rx_func,
+ x_arg1, x_arg2, mac_rx_srs, mac_tx_srs);
+ return;
+ }
+
+ mac_flow_cpu_init(flent, mrp);
+
+ /*
+ * Set up fanout for both SW (0th SRS) and HW classified
+ * SRS (the rest of the Rx SRSes in flent).
+ */ + for (i = 0; i < rx_srs_cnt; i++) { + mac_rx_srs = flent->fe_rx_srs[i]; + if (i != 0) + mac_tx_srs = NULL; + switch (mac_rx_srs->srs_fanout_state) { + case SRS_FANOUT_UNINIT: + mac_srs_fanout_init(mcip, flent, mrp, rx_func, + x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); + break; + case SRS_FANOUT_INIT: + break; + case SRS_FANOUT_REINIT: + mac_rx_srs_quiesce(mac_rx_srs, SRS_QUIESCE); + mac_srs_fanout_modify(mcip, flent, mrp, rx_func, + x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); + mac_rx_srs_restart(mac_rx_srs); + break; + default: + VERIFY(mac_rx_srs->srs_fanout_state <= + SRS_FANOUT_REINIT); + break; + } + } +} + +/* + * mac_create_soft_ring_set: + * + * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is + * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side + * processing is created. + * + * Details on Rx SRS: + * Create a SRS and also add the necessary soft rings for TCP and + * non-TCP based on fanout type and count specified. + * + * mac_soft_ring_fanout, mac_srs_fanout_modify (?), + * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc need + * to be heavily modified. + * + * mi_soft_ring_list_size, mi_soft_ring_size, etc need to disappear. + */ +mac_soft_ring_set_t * +mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, + mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2, + mac_ring_t *ring) +{ + mac_soft_ring_set_t *mac_srs; + mac_srs_rx_t *srs_rx; + mac_srs_tx_t *srs_tx; + mac_bw_ctl_t *mac_bw; + mac_resource_props_t *mrp; + boolean_t is_tx_srs = ((srs_type & SRST_TX) != 0); + + mac_srs = kmem_cache_alloc(mac_srs_cache, KM_SLEEP); + bzero(mac_srs, sizeof (mac_soft_ring_set_t)); + srs_rx = &mac_srs->srs_rx; + srs_tx = &mac_srs->srs_tx; + + mutex_enter(&flent->fe_lock); + + /* + * Get the bandwidth control structure from the flent. Get + * rid of any residual values in the control structure for + * the tx bw struct and also for the rx, if the rx srs is + * the 1st one being brought up (the rx bw ctl struct may + * be shared by multiple SRSs) + */ + if (is_tx_srs) { + mac_srs->srs_bw = &flent->fe_tx_bw; + bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t)); + flent->fe_tx_srs = mac_srs; + } else { + /* + * The bw counter (stored in the flent) is shared + * by SRS's within an rx group. + */ + mac_srs->srs_bw = &flent->fe_rx_bw; + /* First rx SRS, clear the bw structure */ + if (flent->fe_rx_srs_cnt == 0) + bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t)); + ASSERT(flent->fe_rx_srs_cnt < MAX_RINGS_PER_GROUP); + flent->fe_rx_srs[flent->fe_rx_srs_cnt] = mac_srs; + flent->fe_rx_srs_cnt++; + } + mac_srs->srs_flent = flent; + mutex_exit(&flent->fe_lock); + + mac_srs->srs_state = 0; + mac_srs->srs_type = (srs_type | SRST_NO_SOFT_RINGS); + mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save = -1; + mac_srs->srs_poll_cpuid = mac_srs->srs_poll_cpuid_save = -1; + mac_srs_fanout_list_alloc(mac_srs); + + /* + * For a flow we use the underlying MAC client's priority range with + * the priority value to find an absolute priority value. For a MAC + * client we use the MAC client's maximum priority as the value. + */ + mrp = &flent->fe_effective_props; + if ((mac_srs->srs_type & SRST_FLOW) != 0) { + mac_srs->srs_pri = FLOW_PRIORITY(mcip->mci_min_pri, + mcip->mci_max_pri, mrp->mrp_priority); + } else { + mac_srs->srs_pri = mcip->mci_max_pri; + } + mac_srs->srs_mcip = mcip; + /* + * We need to insert the SRS in the global list before + * binding the SRS and SR threads. 
+ * Otherwise there is a small window where the cpu reconfig
+ * callbacks may miss the SRS in the list walk and DR could
+ * fail as there are bound threads.
+ */
+ mac_srs_add_glist(mac_srs);
+
+ /* Initialize bw limit */
+ if ((mrp->mrp_mask & MRP_MAXBW) != 0) {
+ mac_srs->srs_drain_func = mac_rx_srs_drain_bw;
+
+ mac_bw = mac_srs->srs_bw;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ mac_bw->mac_bw_drop_threshold = mac_bw->mac_bw_limit << 1;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ mac_srs->srs_type |= SRST_BW_CONTROL;
+ } else {
+ mac_srs->srs_drain_func = mac_rx_srs_drain;
+ }
+
+ /*
+ * We use the following policy to control Receive
+ * Side Dynamic Polling:
+ * 1) We switch to poll mode anytime the processing thread causes
+ * a backlog to build up in the SRS and its associated Soft Rings
+ * (sr_poll_pkt_cnt > 0).
+ * 2) As long as the backlog stays under the low water mark
+ * (sr_lowat), we poll the H/W for more packets.
+ * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low water mark, we
+ * stay in poll mode but don't poll the H/W for more packets.
+ * 4) Anytime in polling mode, if we poll the H/W for packets and
+ * find nothing plus we have an existing backlog
+ * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
+ * the H/W for packets anymore (let the polling thread go to sleep).
+ * 5) Once the backlog is relieved (packets are processed) we reenable
+ * polling (by signalling the poll thread) only when the backlog
+ * dips below sr_poll_thres.
+ * 6) sr_hiwat is used exclusively when we are not polling capable
+ * and is used to decide when to drop packets so the SRS queue
+ * length doesn't grow infinitely.
+ */
+ if (!is_tx_srs) {
+ srs_rx->sr_hiwat = mac_soft_ring_max_q_cnt;
+ /* Low water mark needs to be less than high water mark */
+ srs_rx->sr_lowat = mac_soft_ring_min_q_cnt <=
+ mac_soft_ring_max_q_cnt ? mac_soft_ring_min_q_cnt :
+ (mac_soft_ring_max_q_cnt >> 2);
+ /* Poll threshold needs to be half of low water mark or less */
+ srs_rx->sr_poll_thres = mac_soft_ring_poll_thres <=
+ (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
+ (srs_rx->sr_lowat >> 1);
+ if (mac_latency_optimize)
+ mac_srs->srs_state |= SRS_LATENCY_OPT;
+ }
+
+ mac_srs->srs_worker = thread_create(NULL, 0,
+ mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);
+
+ if (is_tx_srs) {
+ /* Handle everything about Tx SRS and return */
+ mac_srs->srs_drain_func = mac_tx_srs_drain;
+ srs_tx->st_max_q_cnt = mac_tx_srs_max_q_cnt;
+ srs_tx->st_hiwat =
+ (mac_tx_srs_hiwat > mac_tx_srs_max_q_cnt) ?
+ mac_tx_srs_max_q_cnt : mac_tx_srs_hiwat;
+ srs_tx->st_arg1 = x_arg1;
+ srs_tx->st_arg2 = x_arg2;
+ return (mac_srs);
+ }
+
+ if ((srs_type & SRST_FLOW) != 0 ||
+ FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ srs_rx->sr_lower_proc = mac_rx_srs_process;
+ else
+ srs_rx->sr_lower_proc = mac_rx_srs_subflow_process;
+
+ srs_rx->sr_func = rx_func;
+ srs_rx->sr_arg1 = x_arg1;
+ srs_rx->sr_arg2 = x_arg2;
+
+ if (ring != NULL) {
+ /* Is the mac_srs created over the RX default group?
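+ * (mi_rx_groups[0] is the default group; an SRS over one of its
+ * rings is marked SRST_DEFAULT_GRP and handles the traffic that
+ * is not hardware classified to a reserved group.)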
+ */
+ if (ring->mr_gh == (mac_group_handle_t)
+ (&mcip->mci_mip->mi_rx_groups[0]))
+ mac_srs->srs_type |= SRST_DEFAULT_GRP;
+
+ mac_srs->srs_ring = ring;
+ ring->mr_srs = mac_srs;
+ ring->mr_classify_type = MAC_HW_CLASSIFIER;
+ ring->mr_flag |= MR_INCIPIENT;
+
+ if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ mac_srs->srs_state |= SRS_POLLING_CAPAB;
+
+ mac_srs->srs_poll_thr = thread_create(NULL, 0,
+ mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
+ mac_srs->srs_pri);
+ }
+ return (mac_srs);
+}
+
+/*
+ * Figure out the number of soft rings required. It depends on
+ * whether protocol fanout is required (for LINKs), whether global
+ * settings require us to do fanout for performance (based on
+ * mac_soft_ring_enable), or whether the user has specifically
+ * requested fanout.
+ */
+static uint32_t
+mac_find_fanout(flow_entry_t *flent, uint32_t link_type)
+{
+ uint32_t fanout_type;
+ mac_resource_props_t *mrp = &flent->fe_effective_props;
+
+ /* no fanout for subflows */
+ switch (link_type) {
+ case SRST_FLOW:
+ fanout_type = SRST_NO_SOFT_RINGS;
+ break;
+ case SRST_LINK:
+ fanout_type = SRST_FANOUT_PROTO;
+ break;
+ }
+
+ /* A primary NIC/link is being plumbed */
+ if (flent->fe_type & FLOW_PRIMARY_MAC) {
+ if (mac_soft_ring_enable && mac_rx_soft_ring_count > 1) {
+ fanout_type |= SRST_FANOUT_SRC_IP;
+ }
+ } else if (flent->fe_type & FLOW_VNIC) {
+ /* A VNIC is being created */
+ if (mrp != NULL && mrp->mrp_ncpus > 0) {
+ fanout_type |= SRST_FANOUT_SRC_IP;
+ }
+ }
+
+ return (fanout_type);
+}
+
+/*
+ * Change a group from h/w to s/w classification.
+ */
+static void
+mac_rx_switch_grp_to_sw(mac_group_t *group)
+{
+ mac_ring_t *ring;
+ mac_soft_ring_set_t *mac_srs;
+
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
+ /*
+ * Remove the SRS associated with the HW ring.
+ * As a result, polling will be disabled.
+ */
+ mac_srs = ring->mr_srs;
+ ASSERT(mac_srs != NULL);
+ mac_rx_srs_remove(mac_srs);
+ ring->mr_srs = NULL;
+ }
+
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+ /*
+ * We need to perform SW classification
+ * for packets landing in these rings
+ */
+ ring->mr_state = MR_INUSE;
+ ring->mr_flag = 0;
+ ring->mr_classify_type = MAC_SW_CLASSIFIER;
+ }
+}
+
+/*
+ * Create the Rx SRS for the S/W classifier and for each ring in the
+ * group (if it is an exclusive group). Also create the Tx SRS.
+ */
+void
+mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_group_t *group, uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_set_t *tx_srs = NULL;
+ mac_ring_t *ring;
+ uint32_t fanout_type;
+ boolean_t created_srs = B_FALSE;
+
+ fanout_type = mac_find_fanout(flent, link_type);
+
+ /* Create the SRS for S/W classification if none exists */
+ if (flent->fe_rx_srs[0] == NULL) {
+ ASSERT(flent->fe_rx_srs_cnt == 0);
+ /* Setup the Rx SRS */
+ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
+ mac_rx_deliver, mcip, NULL, NULL);
+
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)mac_srs->srs_rx.sr_lower_proc;
+ flent->fe_cb_arg1 = (void *)mip;
+ flent->fe_cb_arg2 = (void *)mac_srs;
+ mutex_exit(&flent->fe_lock);
+
+ /* Setup the Tx SRS as well */
+ ASSERT(flent->fe_tx_srs == NULL);
+ tx_srs = mac_srs_create(mcip, flent, SRST_TX | link_type,
+ NULL, mcip, NULL, NULL);
+
+ if (mcip->mci_share != NULL) {
+ mac_srs_tx_t *tx = &tx_srs->srs_tx;
+ ASSERT(!mcip->mci_no_hwrings);
+ /*
+ * A share requires a dedicated TX group.
+ * mac_reserve_tx_group() does the work needed to
+ * allocate a new group and populate that group
+ * with rings according to the driver requirements
+ * and limitations.
+ */
+ tx->st_group =
+ mac_reserve_tx_group(mip, mcip->mci_share);
+ ASSERT(tx->st_group != NULL);
+ tx->st_group->mrg_tx_client = mcip;
+ }
+ mac_tx_srs_setup(mcip, flent, link_type);
+ created_srs = B_TRUE;
+ }
+
+ if (group == NULL) {
+ if (created_srs) {
+ mac_fanout_setup(mcip, flent,
+ MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
+ mcip, NULL);
+ }
+ return;
+ }
+
+ /*
+ * Fanout for the default SRS is done when the default SRSes are
+ * created above. As each ring is added to the group, we set up
+ * the SRS and fanout for it.
+ */
+ switch (group->mrg_state) {
+ case MAC_GROUP_STATE_RESERVED:
+ /*
+ * The group is exclusively ours. Create an SRS
+ * for each ring in the group and allow the
+ * individual SRSes to dynamically poll their
+ * Rx rings. Do this only if the client is not
+ * a VLAN MAC client, since for VLANs we do
+ * s/w classification for the VID check.
+ */
+ if (i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE)
+ break;
+ for (ring = group->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ switch (ring->mr_state) {
+ case MR_INUSE:
+ case MR_FREE:
+ if (ring->mr_srs != NULL)
+ break;
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+
+ ring->mr_state = MR_INUSE;
+
+ mac_srs = mac_srs_create(mcip, flent,
+ fanout_type | link_type,
+ mac_rx_deliver, mcip, NULL, ring);
+ if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) {
+ mac_srs->srs_rx.sr_enqueue_always =
+ B_TRUE;
+ }
+ break;
+ default:
+ cmn_err(CE_PANIC, "srs_setup: mcip = %p "
+ "trying to add UNKNOWN ring = %p\n",
+ (void *)mcip, (void *)ring);
+ break;
+ }
+ }
+ break;
+ case MAC_GROUP_STATE_SHARED:
+ /*
+ * Set all rings of this group to software classified.
+ *
+ * If the group is currently RESERVED, the existing mac
+ * client (the only client on this group) is using this
+ * group exclusively. In that case we need to disable
+ * polling on the rings of the group (if it was enabled),
+ * and free the SRS associated with the rings.
+ */
+ mac_rx_switch_grp_to_sw(group);
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+ mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
+ mac_rx_deliver, mcip, NULL);
+}
+
+void
+mac_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_set_t *tx_srs;
+ mac_srs_tx_t *tx;
+ int i;
+
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_srs = flent->fe_rx_srs[i];
+ mac_rx_srs_quiesce(mac_srs, SRS_CONDEMNED);
+ /*
+ * Deal with all fanout tear down etc.
+ */
+ mac_srs_free(mac_srs);
+ flent->fe_rx_srs[i] = NULL;
+ }
+ flent->fe_rx_srs_cnt = 0;
+
+ tx_srs = flent->fe_tx_srs;
+ tx = &tx_srs->srs_tx;
+ switch (link_type) {
+ case SRST_FLOW:
+ /*
+ * For flows, we need to work with the passed
+ * flent to find the Rx/Tx SRS.
+ */
+ mac_tx_srs_quiesce(tx_srs, SRS_CONDEMNED);
+ break;
+ case SRST_LINK:
+ mac_tx_client_quiesce(mcip, SRS_CONDEMNED);
+ /*
+ * Release the TX resources. First the TX group, if any
+ * was assigned to the MAC client, which will cause the
+ * TX rings to be moved back to the pool. Then free the
+ * rings themselves.
+ */
+ if (tx->st_group != NULL) {
+ mac_release_tx_group(tx_srs->srs_mcip->mci_mip,
+ tx->st_group);
+ tx->st_group = NULL;
+ }
+ if (tx->st_arg2 != NULL) {
+ ASSERT(tx_srs->srs_type & SRST_TX);
+ mac_release_tx_ring(tx->st_arg2);
+ }
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+ mac_srs_free(tx_srs);
+ flent->fe_tx_srs = NULL;
+}
+
+/*
+ * This is the group state machine. The state of an Rx group is given by
+ * the following table. The default group and its rings are started in
+ * mac_start itself and the default group stays in SHARED state until
+ * mac_stop, at which time the group and rings are stopped and it
+ * reverts to the REGISTERED state.
+ *
+ * Typically this function is called on a group after adding or removing a
+ * client from it, to find out what should be the new state of the group.
+ * If the new state is RESERVED, then the client that owns this group
+ * exclusively is also returned. Note that adding or removing a client from
+ * a group could also impact the default group and the caller needs to
+ * evaluate the effect on the default group.
+ *
+ * Group type # of clients mi_nactiveclients Group State
+ * in the group
+ *
+ * Non-default 0 N.A. REGISTERED
+ * Non-default 1 N.A. RESERVED
+ * Non-default > 1 N.A. SHARED
+ *
+ * Default 0 N.A. SHARED
+ * Default 1 1 RESERVED
+ * Default 1 > 1 SHARED
+ * Default > 1 N.A. SHARED
+ */
+mac_group_state_t
+mac_rx_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip)
+{
+ mac_impl_t *mip = (mac_impl_t *)grp->mrg_mh;
+
+ *group_only_mcip = NULL;
+
+ /* Non-default group */
+
+ if (grp != mip->mi_rx_groups) {
+ if (MAC_RX_GROUP_NO_CLIENT(grp))
+ return (MAC_GROUP_STATE_REGISTERED);
+
+ *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp);
+ if (*group_only_mcip != NULL)
+ return (MAC_GROUP_STATE_RESERVED);
+
+ return (MAC_GROUP_STATE_SHARED);
+ }
+
+ /* Default group */
+
+ if (MAC_RX_GROUP_NO_CLIENT(grp) || mip->mi_nactiveclients != 1)
+ return (MAC_GROUP_STATE_SHARED);
+
+ *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp);
+ ASSERT(*group_only_mcip != NULL);
+ return (MAC_GROUP_STATE_RESERVED);
+}
+
+/*
+ * OVERVIEW NOTES FOR DATAPATH
+ * ===========================
+ *
+ * Create an SRS and set up the corresponding flow function and args.
+ * Add a classification rule for the flow specified by 'flent' and program
+ * the hardware classifier when applicable.
+ *
+ * Rx ring assignment, SRS, polling and B/W enforcement
+ * ----------------------------------------------------
+ *
+ * We try to use H/W classification on the NIC to assign traffic for a
+ * MAC address to a particular Rx ring. There is a 1-1 mapping
+ * between a SRS and a Rx ring. The SRS (short for soft ring set)
+ * dynamically switches the underlying Rx ring between interrupt
+ * and polling mode and enforces any specified B/W control.
+ *
+ * There is always a SRS created and tied to each H/W and S/W rule.
+ * Whenever we create a H/W rule, we always add the same rule to the
+ * S/W classifier and tie a SRS to it.
+ *
+ * In case a B/W control is specified, it's broken into bytes
+ * per tick and as soon as the quota for a tick is exhausted,
+ * the underlying Rx ring is forced into poll mode for the remaining
+ * tick. The SRS poll thread only polls for bytes that are
+ * allowed to come in the SRS. We typically let 4x the configured
+ * B/W worth of packets come into the SRS (to prevent unnecessary
+ * drops due to bursts) but only process the specified amount.
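+ *
+ * For example (assuming the B/W limit is given in bits per second
+ * and hz == 100): a 10 Mb/s limit allows roughly 12500 bytes per
+ * tick; once a tick's quota is used up, the ring stays in poll
+ * mode for the rest of that tick, while up to 4x that amount may
+ * be queued to absorb bursts.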
+ *
+ * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more
+ * Rx rings (and corresponding SRSes) assigned to it. The SRS
+ * in turn can have softrings to do protocol level fanout or
+ * softrings to do S/W based fanout or both. In case the NIC
+ * has no Rx rings, we do S/W classification to the respective SRS.
+ * The S/W classification rule is always set up and ready. This
+ * allows the MAC layer to reassign Rx rings whenever needed
+ * while packets still continue to flow via the default path and
+ * get S/W classified to the correct SRS.
+ *
+ * In other cases where a NIC or VNIC is plumbed, our goal is to use
+ * the H/W classifier and get two Rx rings assigned for the Link. One
+ * for TCP and one for UDP|SCTP. The respective SRSes still do the
+ * polling on the Rx ring. For a Link that is plumbed for IP, there
+ * is a TCP squeue which also does polling and can control the
+ * Rx ring directly (where the SRS is just a pass through). For
+ * the following cases, the SRS does the polling underneath.
+ * 1) non IP based Links (Links which are not plumbed via ifconfig)
+ * and paths which have no IP squeues (UDP & SCTP)
+ * 2) If B/W control is specified on the Link
+ * 3) If S/W fanout is specified
+ *
+ * Note1: In the current implementation, we try to assign only 1 Rx
+ * ring per Link and more than 1 Rx ring for the primary Link for
+ * H/W based fanout. We always create the following softrings per SRS:
+ * 1) TCP softring which is polled by the TCP squeue where possible
+ * (and also bypasses DLS)
+ * 2) UDP/SCTP based which bypasses DLS
+ * 3) OTH softring which goes via DLS (currently deals with IPv6
+ * and non TCP/UDP/SCTP for IPv4 packets).
+ *
+ * It is necessary to create 3 softrings since the SRS has to poll
+ * the single Rx ring underneath and enforce any link level B/W
+ * control (we can't switch the Rx ring into poll mode just based
+ * on the TCP squeue if the same Rx ring is sharing UDP and other
+ * traffic as well). Once polling is done and any Link level B/W
+ * control is specified, the packets are assigned to the respective
+ * softring based on protocol. Since TCP has an IP based squeue
+ * which benefits from polling, we separate TCP packets into
+ * their own softring which can be polled by the IP squeue. We need
+ * to separate out UDP/SCTP into the UDP softring since it can bypass
+ * the DLS layer, which has heavy performance advantages, and we
+ * need a softring (OTH) for the rest.
+ *
+ * ToDo: The 3 softrings for protocol are needed only till we can
+ * get rid of DLS from the datapath, make IPv4 and IPv6 paths
+ * symmetric (deal with mac_header_info for v6 and polling for
+ * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues
+ * are generic), and bring SAP based classification to the MAC layer.
+ *
+ * H/W and S/W based fanout and multiple Rx rings per Link
+ * -------------------------------------------------------
+ *
+ * In case fanout is requested (or determined automatically based
+ * on Link speed and processor speed), we try to assign multiple
+ * Rx rings per Link with their respective SRSes. In this case
+ * the NIC should be capable of fanning out incoming packets between
+ * the assigned Rx rings (H/W based fanout). All the SRSes
+ * individually switch their Rx ring between interrupt and polling
+ * mode but share a common B/W control counter in case Link
+ * level B/W is specified.
+ *
+ * If S/W based fanout is specified in lieu of H/W based fanout,
+ * the Link SRS creates the specified number of softrings for
+ * each protocol (TCP, UDP, OTH).
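+ * For example, a S/W fanout count of 4 yields 4 TCP, 4 UDP and
+ * 4 OTH softrings under the one Link SRS.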
+ * Incoming packets are fanned out to the correct softring based
+ * on their protocol and a protocol specific hash function.
+ *
+ * Primary and non primary MAC clients
+ * -----------------------------------
+ *
+ * The NICs, VNICs, Vlans, and Aggrs are typically termed Links
+ * and are a Layer 2 construct.
+ *
+ * Primary NIC:
+ * The Link that owns the primary MAC address and typically
+ * is used as the data NIC in non virtualized cases. As such,
+ * H/W resources are preferentially given to the primary NIC. As
+ * far as code is concerned, there is no difference between the
+ * primary NIC and VNICs. They are all treated as Links.
+ * At the very first call to mac_unicast_add() we program the S/W
+ * classifier for the primary MAC address, get a soft ring set
+ * (and soft rings based on 'ip_soft_ring_cnt')
+ * and an Rx ring assigned for polling to get enabled.
+ * When IP gets plumbed and negotiates polling, we can
+ * let the squeue do the polling on the TCP softring.
+ *
+ * VNICs:
+ * Same as any other Link. As long as the H/W resource assignments
+ * are equal, the data path and setup for all Links is the same.
+ *
+ * Flows:
+ * Can be configured on Links. They have their own SRS and the
+ * S/W classifier is programmed appropriately based on the flow.
+ * The flows typically deal with layer 3 and above and
+ * create a soft ring set specific to the flow. The receive
+ * side function is switched from mac_rx_srs_process to
+ * mac_rx_srs_subflow_process, which first tries to assign the
+ * packet to the appropriate flow SRS and, failing that, assigns it
+ * to the link SRS. This allows us to avoid the layered approach,
+ * which gets complex.
+ *
+ * By the time mac_datapath_setup() completes, we already have the
+ * soft ring sets, Rx rings, soft rings, etc. figured out and both H/W
+ * and S/W classifiers programmed. IP is not plumbed yet (and might
+ * never be, for the Virtual Machine guest OS path). When IP is plumbed
+ * (for both NIC and VNIC), we do a capability negotiation for polling
+ * and upcall functions etc.
+ *
+ * Rx ring Assignment NOTES
+ * ------------------------
+ *
+ * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings
+ * as a NIC with a single default ring), we assign the only ring to
+ * the primary Link as MAC_RX_HW_DEFAULT_RING. The primary Link SRS can
+ * do polling on it as long as it is the only link in use and we compare
+ * the MAC address for unicast packets before accepting an incoming
+ * packet (there is no need for S/W classification in this case). We
+ * disable polling on the only ring the moment a 2nd link gets created
+ * (the polling remains enabled even though there are broadcast and
+ * multicast flows created).
+ *
+ * If the NIC has more than 1 Rx ring, we assign the default ring (the
+ * 1st ring) to deal with broadcast, multicast and traffic for other
+ * NICs which needs S/W classification. We assign the primary MAC
+ * address to another ring by specifying a classification rule for
+ * the primary unicast MAC address to the selected ring. The primary
+ * Link (and its SRS) can continue to poll the assigned Rx ring at all
+ * times independently.
+ *
+ * Right now we just assign MAC_RX_HW_DEFAULT_RING to note that it is
+ * the primary NIC and later we will check to see how many Rx rings we
+ * have and whether we can get a non default Rx ring for the primary MAC.
+ *
+ * Note: In the future, if no fanout is specified, we try to assign 2 Rx
+ * rings for the primary Link with the primary MAC address + TCP going
+ * to one ring and primary MAC address + UDP|SCTP going to the other ring.
+ * Any remaining traffic for the primary MAC address can go to the default
+ * Rx ring and get S/W classified. This way the respective SRSes don't
+ * need to do proto fanout and don't need to have softrings at all and
+ * can poll their respective Rx rings.
+ *
+ * As an optimization, when a new NIC or VNIC is created, we can get
+ * only one Rx ring and make it a TCP specific Rx ring and use the
+ * H/W default Rx ring for the rest (this Rx ring is never polled).
+ */
+int
+mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *group = NULL;
+ mac_group_t *default_group;
+ int err;
+ uint8_t *mac_addr;
+ mac_rx_group_reserve_type_t rtype = MAC_RX_RESERVE_NONDEFAULT;
+ mac_group_state_t next_state;
+ mac_client_impl_t *group_only_mcip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (link_type) {
+ case SRST_FLOW:
+ mac_srs_group_setup(mcip, flent, NULL, link_type);
+ return (0);
+
+ case SRST_LINK:
+ mac_addr = flent->fe_flow_desc.fd_dst_mac;
+
+ /* Check if we need to reserve the default group */
+ if (flent->fe_type & FLOW_PRIMARY_MAC)
+ rtype = MAC_RX_RESERVE_DEFAULT;
+
+ if (!mcip->mci_no_hwrings) {
+ /*
+ * Check to see if we can get an exclusive group for
+ * this mac address or if there already exists a
+ * group that has this mac address (case of VLANs).
+ * If no groups are available, use the default group.
+ */
+ group = mac_reserve_rx_group(mcip, mac_addr, rtype);
+ }
+
+ if (group == NULL) {
+ if (mcip->mci_req_hwrings)
+ return (ENOSPC);
+ group = &mip->mi_rx_groups[0];
+ }
+
+ /*
+ * Some NICs don't support any Rx rings, so there may not
+ * even be a default group.
+ */
+ if (group != NULL) {
+ flent->fe_rx_ring_group = group;
+ /*
+ * Add the client to the group. This could cause
+ * either this group to move to the shared state or
+ * cause the default group to move to the shared state.
+ * The actions on this group are done here, while the
+ * actions on the default group are postponed to
+ * the end of this function.
+ */
+ mac_rx_group_add_client(group, mcip);
+ next_state = mac_rx_group_next_state(group,
+ &group_only_mcip);
+
+ ASSERT((next_state == MAC_GROUP_STATE_RESERVED &&
+ mcip == group_only_mcip) ||
+ (next_state == MAC_GROUP_STATE_SHARED &&
+ group_only_mcip == NULL));
+
+ mac_set_rx_group_state(group, next_state);
+ }
+
+ /*
+ * Setup the Rx and Tx SRSes. If we got a pristine group
+ * exclusively above, mac_srs_group_setup would simply create
+ * the required SRSes. If we ended up sharing a previously
+ * reserved group, mac_srs_group_setup would also dismantle the
+ * SRSes of the previously exclusive group.
+ */
+ mac_srs_group_setup(mcip, flent, group, link_type);
+
+ /* Program the S/W Classifier */
+ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
+ goto setup_failed;
+
+ /* Program the H/W Classifier */
+ if ((err = mac_add_macaddr(mip, group, mac_addr)) != 0)
+ goto setup_failed;
+ mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);
+ ASSERT(mcip->mci_unicast != NULL);
+ break;
+
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+
+ /*
+ * All broadcast and multicast traffic is received only on the default
+ * group.
+ * If we have set up the datapath for a non-default group above,
+ * then move the default group to shared state to allow distribution of
+ * incoming broadcast traffic to the other groups and dismantle the
+ * SRSes over the default group.
+ */
+ if (group != NULL) {
+ if (group != mip->mi_rx_groups) {
+ default_group = mip->mi_rx_groups;
+ if (default_group->mrg_state ==
+ MAC_GROUP_STATE_RESERVED) {
+ group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(
+ default_group);
+ ASSERT(group_only_mcip != NULL &&
+ mip->mi_nactiveclients > 1);
+
+ mac_set_rx_group_state(default_group,
+ MAC_GROUP_STATE_SHARED);
+ mac_srs_group_setup(group_only_mcip,
+ group_only_mcip->mci_flent,
+ default_group, SRST_LINK);
+ }
+ ASSERT(default_group->mrg_state ==
+ MAC_GROUP_STATE_SHARED);
+ }
+ /*
+ * If we get an exclusive group for a VLAN MAC client we
+ * need to take the s/w path to make the additional check for
+ * the vid. Disable polling and set it to s/w classification.
+ */
+ if (group->mrg_state == MAC_GROUP_STATE_RESERVED &&
+ i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE) {
+ mac_rx_switch_grp_to_sw(group);
+ }
+ }
+ return (0);
+
+setup_failed:
+ mac_datapath_teardown(mcip, flent, link_type);
+ return (err);
+}
+
+void
+mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *group = NULL;
+ mac_client_impl_t *grp_only_mcip;
+ flow_entry_t *group_only_flent;
+ mac_group_t *default_group;
+ boolean_t check_default_group = B_FALSE;
+ mac_group_state_t next_state;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (link_type) {
+ case SRST_FLOW:
+ mac_srs_group_teardown(mcip, flent, SRST_FLOW);
+ return;
+
+ case SRST_LINK:
+ /* Stop sending packets */
+ mac_tx_client_block(mcip);
+
+ /* Stop the packets coming from the H/W */
+ if (mcip->mci_unicast != NULL) {
+ int err;
+ err = mac_remove_macaddr(mcip->mci_unicast);
+ if (err != 0) {
+ cmn_err(CE_WARN, "%s: failed to remove a MAC"
+ " address because of error 0x%x",
+ mip->mi_name, err);
+ }
+ mcip->mci_unicast = NULL;
+ }
+
+ /* Stop the packets coming from the S/W classifier */
+ mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+
+ /* Now quiesce and destroy all SRS and soft rings */
+ mac_srs_group_teardown(mcip, flent, SRST_LINK);
+ ASSERT((mcip->mci_flent == flent) &&
+ (flent->fe_next == NULL));
+
+ /*
+ * Release our hold on the group as well. We need
+ * to check if the shared group has only one client
+ * left who can use it exclusively. Also, if we
+ * were the last client, release the group.
+ */
+ group = flent->fe_rx_ring_group;
+ if (group != NULL) {
+ mac_rx_group_remove_client(group, mcip);
+ next_state = mac_rx_group_next_state(group,
+ &grp_only_mcip);
+ if (next_state == MAC_GROUP_STATE_RESERVED) {
+ /*
+ * Only one client left on this RX group.
+ */
+ ASSERT(grp_only_mcip != NULL);
+ mac_set_rx_group_state(group,
+ MAC_GROUP_STATE_RESERVED);
+ group_only_flent = grp_only_mcip->mci_flent;
+
+ /*
+ * The only remaining client has exclusive
+ * access on the group. Allow it to
+ * dynamically poll the H/W rings etc.
+ */
+ mac_srs_group_setup(grp_only_mcip,
+ group_only_flent, group, SRST_LINK);
+ mac_rx_group_unmark(group, MR_INCIPIENT);
+ } else if (next_state == MAC_GROUP_STATE_REGISTERED) {
+ /*
+ * This is a non-default group being freed up.
+ * We need to reevaluate the default group
+ * to see if the primary client can get
+ * exclusive access to the default group.
+				 */
+				ASSERT(group != mip->mi_rx_groups);
+				mac_release_rx_group(mcip, group);
+				mac_set_rx_group_state(group,
+				    MAC_GROUP_STATE_REGISTERED);
+				check_default_group = B_TRUE;
+			} else {
+				ASSERT(next_state == MAC_GROUP_STATE_SHARED);
+				mac_set_rx_group_state(group,
+				    MAC_GROUP_STATE_SHARED);
+				mac_rx_group_unmark(group, MR_CONDEMNED);
+			}
+			flent->fe_rx_ring_group = NULL;
+		}
+		break;
+	default:
+		ASSERT(B_FALSE);
+		break;
+	}
+
+	/*
+	 * The mac client using the default group gets exclusive access to the
+	 * default group if and only if it is the sole client on the entire
+	 * mip. If so, set the group state to reserved, and set up the SRSes
+	 * over the default group.
+	 */
+	if (check_default_group) {
+		default_group = mip->mi_rx_groups;
+		ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED);
+		next_state = mac_rx_group_next_state(default_group,
+		    &grp_only_mcip);
+		if (next_state == MAC_GROUP_STATE_RESERVED) {
+			ASSERT(grp_only_mcip != NULL &&
+			    mip->mi_nactiveclients == 1);
+			mac_set_rx_group_state(default_group,
+			    MAC_GROUP_STATE_RESERVED);
+			mac_srs_group_setup(grp_only_mcip,
+			    grp_only_mcip->mci_flent,
+			    default_group, SRST_LINK);
+		}
+	}
+}
+
+/* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */
+
+static void
+mac_srs_fanout_list_free(mac_soft_ring_set_t *mac_srs)
+{
+	ASSERT(mac_srs->srs_tcp_soft_rings != NULL);
+	kmem_free(mac_srs->srs_tcp_soft_rings,
+	    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+	mac_srs->srs_tcp_soft_rings = NULL;
+	ASSERT(mac_srs->srs_udp_soft_rings != NULL);
+	kmem_free(mac_srs->srs_udp_soft_rings,
+	    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+	mac_srs->srs_udp_soft_rings = NULL;
+	ASSERT(mac_srs->srs_oth_soft_rings != NULL);
+	kmem_free(mac_srs->srs_oth_soft_rings,
+	    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+	mac_srs->srs_oth_soft_rings = NULL;
+}
+
+/*
+ * An RX SRS is attached to at most one mac_ring.
+ * A TX SRS has no rings.
+ */
+static void
+mac_srs_ring_free(mac_soft_ring_set_t *mac_srs)
+{
+	mac_client_impl_t *mcip;
+	mac_ring_t *ring;
+	flow_entry_t *flent;
+
+	ring = mac_srs->srs_ring;
+	if (mac_srs->srs_type & SRST_TX) {
+		ASSERT(ring == NULL);
+		return;
+	}
+
+	if (ring == NULL)
+		return;
+
+	/*
+	 * Broadcast flows don't have a client impl association, but they
+	 * use only soft rings.
+	 */
+	flent = mac_srs->srs_flent;
+	mcip = flent->fe_mcip;
+	ASSERT(mcip != NULL);
+
+	ring->mr_classify_type = MAC_NO_CLASSIFIER;
+	ring->mr_srs = NULL;
+}
+
+/*
+ * Physical unlink and free of the data structures happen below. This is
+ * driven from mac_flow_destroy(), on the last refrele of a flow.
+ *
+ * Assumes an Rx SRS is 1-1 mapped with a ring.
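+ *
+ * By this point the SRS must already have gone through the quiesce path;
+ * the ASSERT below checks exactly that. A minimal sketch of the expected
+ * sequence (illustrative only, the waiting details are elided):
+ *
+ *	mac_srs_signal(srs, SRS_CONDEMNED);
+ *	... wait until SRS_CONDEMNED_DONE shows up in srs_state ...
+ *	mac_srs_free(srs);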
+ */ +void +mac_srs_free(mac_soft_ring_set_t *mac_srs) +{ + ASSERT(mac_srs->srs_mcip == NULL || + MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | + SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); + + mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_srs_ring_free(mac_srs); + mac_srs_soft_rings_free(mac_srs, B_TRUE); + mac_srs_fanout_list_free(mac_srs); + + mac_srs->srs_bw = NULL; + kmem_cache_free(mac_srs_cache, mac_srs); +} + +static void +mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *mac_srs, uint_t s_ring_flag) +{ + mac_soft_ring_t *softring; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + + mac_srs_soft_rings_signal(mac_srs, s_ring_flag); + if (s_ring_flag == S_RING_CONDEMNED) { + while (mac_srs->srs_soft_ring_condemned_count != + mac_srs->srs_soft_ring_count) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + } else { + while (mac_srs->srs_soft_ring_quiesced_count != + mac_srs->srs_soft_ring_count) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + } + mutex_exit(&mac_srs->srs_lock); + + for (softring = mac_srs->srs_soft_ring_head; softring != NULL; + softring = softring->s_ring_next) + (void) untimeout(softring->s_ring_tid); + + (void) untimeout(mac_srs->srs_tid); + + mutex_enter(&mac_srs->srs_lock); +} + +/* + * The block comment above mac_rx_classify_flow_state_change explains the + * background. At this point upcalls from the driver (both hardware classified + * and software classified) have been cut off. We now need to quiesce the + * SRS worker, poll, and softring threads. The SRS worker thread serves as + * the master controller. The steps involved are described below in the function + */ +void +mac_srs_worker_quiesce(mac_soft_ring_set_t *mac_srs) +{ + uint_t s_ring_flag; + uint_t srs_poll_wait_flag; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + ASSERT(mac_srs->srs_state & (SRS_CONDEMNED | SRS_QUIESCE)); + + if (mac_srs->srs_state & SRS_CONDEMNED) { + s_ring_flag = S_RING_CONDEMNED; + srs_poll_wait_flag = SRS_POLL_THR_EXITED; + } else { + s_ring_flag = S_RING_QUIESCE; + srs_poll_wait_flag = SRS_POLL_THR_QUIESCED; + } + + /* + * In the case of Rx SRS wait till the poll thread is done. + */ + if ((mac_srs->srs_type & SRST_TX) == 0 && + mac_srs->srs_poll_thr != NULL) { + while (!(mac_srs->srs_state & srs_poll_wait_flag)) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + + /* + * Turn off polling as part of the quiesce operation. + */ + MAC_SRS_POLLING_OFF(mac_srs); + mac_srs->srs_state &= ~(SRS_POLLING | SRS_GET_PKTS); + } + + /* + * Then signal the soft ring worker threads to quiesce or quit + * as needed and then wait till that happens. + */ + mac_srs_soft_rings_quiesce(mac_srs, s_ring_flag); + + if (mac_srs->srs_state & SRS_CONDEMNED) + mac_srs->srs_state |= (SRS_QUIESCE_DONE | SRS_CONDEMNED_DONE); + else + mac_srs->srs_state |= SRS_QUIESCE_DONE; + cv_signal(&mac_srs->srs_quiesce_done_cv); +} + +/* + * Signal an SRS to start a temporary quiesce, or permanent removal, or restart + * a quiesced SRS by setting the appropriate flags and signaling the SRS worker + * or poll thread. This function is internal to the quiescing logic and is + * called internally from the SRS quiesce or flow quiesce or client quiesce + * higher level functions. 
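+ *
+ * As an illustrative sketch (not a verbatim caller), a temporary
+ * quiesce/restart cycle is driven as:
+ *
+ *	mac_srs_signal(srs, SRS_QUIESCE);
+ *	mutex_enter(&srs->srs_lock);
+ *	while (!(srs->srs_state & SRS_QUIESCE_DONE))
+ *		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
+ *	mutex_exit(&srs->srs_lock);
+ *	... the SRS is now quiet ...
+ *	mac_srs_signal(srs, SRS_RESTART);
+ *
+ * The worker thread acts on the flag via mac_srs_worker_quiesce() above
+ * and mac_srs_worker_restart() below.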
+ */
+void
+mac_srs_signal(mac_soft_ring_set_t *mac_srs, uint_t srs_flag)
+{
+	mac_ring_t *ring;
+
+	ring = mac_srs->srs_ring;
+	ASSERT(ring == NULL || ring->mr_refcnt == 0);
+
+	if (srs_flag == SRS_CONDEMNED) {
+		/*
+		 * The SRS is going away. We need to unbind the SRS and SR
+		 * threads before removing from the global SRS list. Otherwise
+		 * there is a small window where the cpu reconfig callbacks
+		 * may miss the SRS in the list walk and DR could fail since
+		 * there are still bound threads.
+		 */
+		mac_srs_threads_unbind(mac_srs);
+		mac_srs_remove_glist(mac_srs);
+	}
+	/*
+	 * Wakeup the SRS worker and poll threads.
+	 */
+	mutex_enter(&mac_srs->srs_lock);
+	mac_srs->srs_state |= srs_flag;
+	cv_signal(&mac_srs->srs_async);
+	cv_signal(&mac_srs->srs_cv);
+	mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * On the Rx side, the quiescing is done bottom up. After the Rx upcalls
+ * from the driver are done, the Rx SRS is quiesced and only then can
+ * we signal the soft rings. Thus this function can't be called arbitrarily
+ * without satisfying the prerequisites. On the Tx side, the threads from
+ * the top need to be quiesced, then the Tx SRS, and only then can we
+ * signal the Tx soft rings.
+ */
+static void
+mac_srs_soft_rings_signal(mac_soft_ring_set_t *mac_srs, uint_t sr_flag)
+{
+	mac_soft_ring_t *softring;
+
+	for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+	    softring = softring->s_ring_next)
+		mac_soft_ring_signal(softring, sr_flag);
+}
+
+/*
+ * The block comment above mac_rx_classify_flow_state_change explains the
+ * background. At this point the SRS is quiesced and we need to restart the
+ * SRS worker, poll, and softring threads. The SRS worker thread serves as
+ * the master controller. The steps involved are described below in the
+ * function.
+ */
+void
+mac_srs_worker_restart(mac_soft_ring_set_t *mac_srs)
+{
+	boolean_t iam_rx_srs;
+	mac_soft_ring_t *softring;
+
+	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+	if ((mac_srs->srs_type & SRST_TX) != 0) {
+		iam_rx_srs = B_FALSE;
+		ASSERT((mac_srs->srs_state &
+		    (SRS_POLL_THR_QUIESCED | SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
+		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
+	} else {
+		iam_rx_srs = B_TRUE;
+		ASSERT((mac_srs->srs_state &
+		    (SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
+		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
+		if (mac_srs->srs_poll_thr != NULL) {
+			ASSERT((mac_srs->srs_state & SRS_POLL_THR_QUIESCED) ==
+			    SRS_POLL_THR_QUIESCED);
+		}
+	}
+
+	/*
+	 * Signal any quiesced soft ring workers to restart and wait for the
+	 * soft ring down count to come down to zero.
+	 */
+	if (mac_srs->srs_soft_ring_quiesced_count != 0) {
+		for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+		    softring = softring->s_ring_next) {
+			if (!(softring->s_ring_state & S_RING_QUIESCE))
+				continue;
+			mac_soft_ring_signal(softring, S_RING_RESTART);
+		}
+		while (mac_srs->srs_soft_ring_quiesced_count != 0)
+			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+	}
+
+	mac_srs->srs_state &= ~(SRS_QUIESCE_DONE | SRS_QUIESCE | SRS_RESTART);
+	if (iam_rx_srs && mac_srs->srs_poll_thr != NULL) {
+		/*
+		 * Signal the poll thread and ask it to restart. Wait till it
+		 * actually restarts and the SRS_POLL_THR_QUIESCED flag gets
+		 * cleared.
+ */ + mac_srs->srs_state |= SRS_POLL_THR_RESTART; + cv_signal(&mac_srs->srs_cv); + while (mac_srs->srs_state & SRS_POLL_THR_QUIESCED) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + ASSERT(!(mac_srs->srs_state & SRS_POLL_THR_RESTART)); + } + /* Wake up any waiter waiting for the restart to complete */ + mac_srs->srs_state |= SRS_RESTART_DONE; + cv_signal(&mac_srs->srs_quiesce_done_cv); +} + +static void +mac_srs_worker_unbind(mac_soft_ring_set_t *mac_srs) +{ + mutex_enter(&mac_srs->srs_lock); + if (!(mac_srs->srs_state & SRS_WORKER_BOUND)) { + ASSERT(mac_srs->srs_worker_cpuid == -1); + mutex_exit(&mac_srs->srs_lock); + return; + } + + mac_srs->srs_worker_cpuid = -1; + mac_srs->srs_state &= ~SRS_WORKER_BOUND; + thread_affinity_clear(mac_srs->srs_worker); + mutex_exit(&mac_srs->srs_lock); +} + +static void +mac_srs_poll_unbind(mac_soft_ring_set_t *mac_srs) +{ + mutex_enter(&mac_srs->srs_lock); + if (mac_srs->srs_poll_thr == NULL || + (mac_srs->srs_state & SRS_POLL_BOUND) == 0) { + ASSERT(mac_srs->srs_poll_cpuid == -1); + mutex_exit(&mac_srs->srs_lock); + return; + } + + mac_srs->srs_poll_cpuid = -1; + mac_srs->srs_state &= ~SRS_POLL_BOUND; + thread_affinity_clear(mac_srs->srs_poll_thr); + mutex_exit(&mac_srs->srs_lock); +} + +static void +mac_srs_threads_unbind(mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *soft_ring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + + mutex_enter(&cpu_lock); + mac_srs_worker_unbind(mac_srs); + if (!(mac_srs->srs_type & SRST_TX)) + mac_srs_poll_unbind(mac_srs); + + for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; + soft_ring = soft_ring->s_ring_next) { + mac_soft_ring_unbind(soft_ring); + } + mutex_exit(&cpu_lock); +} + +/* + * When a CPU is going away, unbind all MAC threads which are bound + * to that CPU. The affinity of the thread to the CPU is saved to allow + * the thread to be rebound to the CPU if it comes back online. + */ +static void +mac_walk_srs_and_unbind(int cpuid) +{ + mac_soft_ring_set_t *mac_srs; + mac_soft_ring_t *soft_ring; + + rw_enter(&mac_srs_g_lock, RW_READER); + + if ((mac_srs = mac_srs_g_list) == NULL) + goto done; + + for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) { + if (mac_srs->srs_worker_cpuid == cpuid) { + mac_srs->srs_worker_cpuid_save = cpuid; + mac_srs_worker_unbind(mac_srs); + } + + if (!(mac_srs->srs_type & SRST_TX)) { + if (mac_srs->srs_poll_cpuid == cpuid) { + mac_srs->srs_poll_cpuid_save = cpuid; + mac_srs_poll_unbind(mac_srs); + } + } + + /* Next tackle the soft rings associated with the srs */ + mutex_enter(&mac_srs->srs_lock); + for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; + soft_ring = soft_ring->s_ring_next) { + if (soft_ring->s_ring_cpuid == cpuid) { + soft_ring->s_ring_cpuid_save = cpuid; + mac_soft_ring_unbind(soft_ring); + } + } + mutex_exit(&mac_srs->srs_lock); + } +done: + rw_exit(&mac_srs_g_lock); +} + +/* TX SETUP and TEARDOWN ROUTINES */ + +/* + * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring() + * handle the case where the number of rings is one. I.e. there is + * a ring pointed to by mac_srs->srs_tx_arg2. 
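+ *
+ * The caller is expected to quiesce the Tx SRS around these calls; note
+ * the ASSERT on SRS_QUIESCE in mac_tx_srs_add_ring(). A hedged sketch of
+ * the intended sequence when, say, an aggregation gains a ring:
+ *
+ *	mac_srs_signal(tx_srs, SRS_QUIESCE);
+ *	... wait for the quiesce to complete ...
+ *	mac_tx_srs_add_ring(tx_srs, new_ring);
+ *	mac_srs_signal(tx_srs, SRS_RESTART);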
+ */ +void +mac_tx_srs_add_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) +{ + mac_client_impl_t *mcip = mac_srs->srs_mcip; + mac_soft_ring_t *soft_ring; + int count = mac_srs->srs_oth_ring_count; + + ASSERT(mac_srs->srs_state & SRS_QUIESCE); + soft_ring = mac_soft_ring_create(count, 0, NULL, + (ST_RING_OTH | ST_RING_TX), maxclsyspri, mcip, mac_srs, -1, + NULL, mcip, (mac_resource_handle_t)tx_ring); + mac_srs->srs_oth_ring_count++; + /* + * put this soft ring in quiesce mode too so when we restart + * all soft rings in the srs are in the same state. + */ + mac_soft_ring_signal(soft_ring, S_RING_QUIESCE); +} + +static void +mac_soft_ring_remove(mac_soft_ring_set_t *mac_srs, mac_soft_ring_t *softring) +{ + int sringcnt; + + mutex_enter(&mac_srs->srs_lock); + sringcnt = mac_srs->srs_soft_ring_count; + ASSERT(sringcnt > 0); + mac_soft_ring_signal(softring, S_RING_CONDEMNED); + + ASSERT(mac_srs->srs_soft_ring_condemned_count == 0); + while (mac_srs->srs_soft_ring_condemned_count != 1) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + + if (softring == mac_srs->srs_soft_ring_head) { + mac_srs->srs_soft_ring_head = softring->s_ring_next; + if (mac_srs->srs_soft_ring_head != NULL) { + mac_srs->srs_soft_ring_head->s_ring_prev = NULL; + } else { + mac_srs->srs_soft_ring_tail = NULL; + } + } else { + softring->s_ring_prev->s_ring_next = + softring->s_ring_next; + if (softring->s_ring_next != NULL) { + softring->s_ring_next->s_ring_prev = + softring->s_ring_prev; + } else { + mac_srs->srs_soft_ring_tail = + softring->s_ring_prev; + } + } + mac_srs->srs_soft_ring_count--; + + mac_srs->srs_soft_ring_condemned_count--; + mutex_exit(&mac_srs->srs_lock); + + mac_soft_ring_free(softring, B_FALSE); +} + +void +mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) +{ + int i; + mac_soft_ring_t *soft_ring, *remove_sring; + + mutex_enter(&mac_srs->srs_lock); + for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { + soft_ring = mac_srs->srs_oth_soft_rings[i]; + if (soft_ring->s_ring_tx_arg2 == tx_ring) + break; + } + mutex_exit(&mac_srs->srs_lock); + ASSERT(i < mac_srs->srs_oth_ring_count); + remove_sring = soft_ring; + mac_soft_ring_remove(mac_srs, remove_sring); + mac_srs_update_fanout_list(mac_srs); +} + +/* + * mac_tx_srs_setup(): + * + * Used to setup Tx rings. If no free Tx ring is available, then default + * Tx ring is used. + */ +void +mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, + uint32_t srs_type) +{ + mac_impl_t *mip = mcip->mci_mip; + mac_soft_ring_set_t *tx_srs; + int i, tx_ring_count = 0, tx_rings_reserved; + mac_ring_handle_t *tx_ring = NULL; + uint32_t soft_ring_type; + mac_group_t *grp = NULL; + mac_ring_t *ring; + mac_srs_tx_t *tx; + boolean_t serialize = B_FALSE; + + tx_srs = flent->fe_tx_srs; + tx = &tx_srs->srs_tx; + + if (tx->st_group != NULL) { + grp = tx->st_group; + tx_ring_count = grp->mrg_cur_count; + } else { + tx_ring_count = mac_tx_ring_count; + } + + if (tx_ring_count != 0) { + tx_ring = kmem_zalloc(sizeof (mac_ring_handle_t) * + tx_ring_count, KM_SLEEP); + } + + /* + * Just use the default ring for now. We need to use + * the underlying link's ring set instead of the underlying + * NIC's. + */ + if (srs_type == SRST_FLOW || mcip->mci_no_hwrings) + goto use_default_ring; + + if (mcip->mci_share != NULL) + ring = grp->mrg_rings; + /* + * An attempt is made to reserve 'tx_ring_count' number + * of Tx rings. If tx_ring_count is 0, default Tx ring + * is used. If it is 1, an attempt is made to reserve one + * Tx ring. 
In both cases, the ring information is
+	 * stored in the Tx SRS. If multiple Tx rings are specified,
+	 * then each Tx ring will have a Tx-side soft ring. All
+	 * these soft rings will hang off the Tx SRS.
+	 */
+	for (i = 0, tx_rings_reserved = 0;
+	    i < tx_ring_count; i++, tx_rings_reserved++) {
+		if (mcip->mci_share != NULL) {
+			/*
+			 * The ring was already chosen and associated
+			 * with the TX group. Save it in the new
+			 * array to keep as much of the code below common
+			 * between the share and non-share cases.
+			 */
+			ASSERT(ring != NULL);
+			tx_ring[i] = (mac_ring_handle_t)ring;
+			ring = ring->mr_next;
+		} else {
+			tx_ring[i] =
+			    (mac_ring_handle_t)mac_reserve_tx_ring(mip, NULL);
+			if (tx_ring[i] == NULL)
+				break;
+		}
+	}
+	if (mac_tx_serialize || (mip->mi_v12n_level & MAC_VIRT_SERIALIZE))
+		serialize = B_TRUE;
+	/*
+	 * Did we get the requested number of tx rings?
+	 * There are 3 actions we can take depending upon the number
+	 * of tx_rings we got.
+	 * 1) If we got none, then hook up the tx_srs with the
+	 * default ring.
+	 * 2) If we got one, then get the tx_ring from the soft ring,
+	 * save it in SRS and free up the soft ring.
+	 * 3) If we got more than 1, then do the tx fanout among the
+	 * rings we obtained.
+	 */
+	switch (tx_rings_reserved) {
+	case 1:
+		/*
+		 * No need to allocate Tx soft rings. Tx-side soft
+		 * rings are for the Tx fanout case. Just use the Tx SRS.
+		 */
+		/* FALLTHRU */
+
+	case 0:
+use_default_ring:
+		if (tx_rings_reserved == 0)
+			tx->st_arg2 = (void *)mip->mi_default_tx_ring;
+		else
+			tx->st_arg2 = (void *)tx_ring[0];
+		/* For ring_count of 0 or 1, set the tx_mode and return */
+		if (tx_srs->srs_type & SRST_BW_CONTROL)
+			tx->st_mode = SRS_TX_BW;
+		else if (serialize)
+			tx->st_mode = SRS_TX_SERIALIZE;
+		else
+			tx->st_mode = SRS_TX_DEFAULT;
+		break;
+
+	default:
+		/*
+		 * We got multiple Tx rings for Tx fanout.
+		 *
+		 * cpuid of -1 is passed. This creates an unbound
+		 * worker thread. Instead the code should get CPU
+		 * binding information and pass that to
+		 * mac_soft_ring_create(). This needs to be done
+		 * in conjunction with Rx-side soft ring
+		 * bindings.
+		 */
+		soft_ring_type = ST_RING_OTH | ST_RING_TX;
+		if (tx_srs->srs_type & SRST_BW_CONTROL) {
+			tx->st_mode = SRS_TX_BW_FANOUT;
+		} else {
+			tx->st_mode = SRS_TX_FANOUT;
+			if (serialize)
+				soft_ring_type |= ST_RING_WORKER_ONLY;
+		}
+		for (i = 0; i < tx_rings_reserved; i++) {
+			(void) mac_soft_ring_create(i, 0, NULL, soft_ring_type,
+			    maxclsyspri, mcip, tx_srs, -1, NULL, mcip,
+			    (mac_resource_handle_t)tx_ring[i]);
+		}
+		mac_srs_update_fanout_list(tx_srs);
+	}
+	tx->st_func = mac_tx_get_func(tx->st_mode);
+
+	DTRACE_PROBE3(tx__srs___setup__return, mac_soft_ring_set_t *, tx_srs,
+	    int, tx->st_mode, int, tx_srs->srs_oth_ring_count);
+
+	if (tx_ring_count != 0) {
+		kmem_free(tx_ring,
+		    sizeof (mac_ring_handle_t) * tx_ring_count);
+	}
+}
+
+/*
+ * Walk through the list of mac clients for the MAC.
+ * For each active mac client, recompute the number of soft rings
+ * associated with every client, only if the current speed is different
+ * from the speed that was previously used for soft ring computation.
+ * If the cable is disconnected while the NIC is started, we would get
+ * notification with speed set to 0. We do not recompute in that case.
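+ *
+ * A hedged sketch of the expected trigger (hypothetical caller): a link
+ * state/speed notification handler would do
+ *
+ *	if (!(mip->mi_state_flags & MIS_IS_VNIC))
+ *		mac_fanout_recompute(mip);
+ *
+ * and the link-down / zero-speed cases are then filtered out below.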
+ */ +void +mac_fanout_recompute(mac_impl_t *mip) +{ + mac_client_impl_t *mcip; + uint64_t ifspeed; + mac_resource_props_t *mcip_mrp; + + i_mac_perim_enter(mip); + ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC)); + + if (mip->mi_linkstate != LINK_STATE_UP) { + i_mac_perim_exit(mip); + return; + } + + for (mcip = mip->mi_clients_list; mcip != NULL; + mcip = mcip->mci_client_next) { + if (!MCIP_DATAPATH_SETUP(mcip)) + continue; + + ifspeed = mac_client_stat_get(mcip->mci_flent->fe_mcip, + MAC_STAT_IFSPEED); + if ((ifspeed != 0) && + (ifspeed != mcip->mci_flent->fe_nic_speed)) { + mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + mac_fanout_setup(mcip, mcip->mci_flent, + mcip_mrp, mac_rx_deliver, mcip, NULL); + } + } + i_mac_perim_exit(mip); +} diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c new file mode 100644 index 0000000000..f4c2113f61 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -0,0 +1,2373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/strsun.h> +#include <sys/sdt.h> +#include <sys/mac.h> +#include <sys/mac_impl.h> +#include <sys/mac_client_impl.h> +#include <sys/dls.h> +#include <sys/dls_impl.h> +#include <sys/mac_soft_ring.h> +#include <sys/ethernet.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/sctp.h> + +/* global flow table, will be a per exclusive-zone table later */ +static mod_hash_t *flow_hash; +static krwlock_t flow_tab_lock; + +static kmem_cache_t *flow_cache; +static kmem_cache_t *flow_tab_cache; +static flow_ops_t flow_l2_ops; + +typedef struct { + const char *fs_name; + uint_t fs_offset; +} flow_stats_info_t; + +#define FS_OFF(f) (offsetof(flow_stats_t, f)) +static flow_stats_info_t flow_stats_list[] = { + {"rbytes", FS_OFF(fs_rbytes)}, + {"ipackets", FS_OFF(fs_ipackets)}, + {"ierrors", FS_OFF(fs_ierrors)}, + {"obytes", FS_OFF(fs_obytes)}, + {"opackets", FS_OFF(fs_opackets)}, + {"oerrors", FS_OFF(fs_oerrors)} +}; +#define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t)) + +/* + * Checks whether a flow mask is legal. 
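+ * Returns the flow_tab_info_t for a supported mask combination, or NULL
+ * for an unsupported one; mac_flow_add_subflow() below, for example,
+ * fails with EOPNOTSUPP when this lookup returns NULL.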
+ */ +static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t); + +static void +flow_stat_init(kstat_named_t *knp) +{ + int i; + + for (i = 0; i < FS_SIZE; i++, knp++) { + kstat_named_init(knp, flow_stats_list[i].fs_name, + KSTAT_DATA_UINT64); + } +} + +static int +flow_stat_update(kstat_t *ksp, int rw) +{ + flow_entry_t *fep = ksp->ks_private; + flow_stats_t *fsp = &fep->fe_flowstats; + kstat_named_t *knp = ksp->ks_data; + uint64_t *statp; + zoneid_t zid; + int i; + + if (rw != KSTAT_READ) + return (EACCES); + + zid = getzoneid(); + if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) { + for (i = 0; i < FS_SIZE; i++, knp++) + knp->value.ui64 = 0; + + return (0); + } + + for (i = 0; i < FS_SIZE; i++, knp++) { + statp = (uint64_t *) + ((uchar_t *)fsp + flow_stats_list[i].fs_offset); + + knp->value.ui64 = *statp; + } + return (0); +} + +static void +flow_stat_create(flow_entry_t *fep) +{ + kstat_t *ksp; + kstat_named_t *knp; + uint_t nstats = FS_SIZE; + + ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow", + KSTAT_TYPE_NAMED, nstats, 0); + if (ksp == NULL) + return; + + ksp->ks_update = flow_stat_update; + ksp->ks_private = fep; + fep->fe_ksp = ksp; + + knp = (kstat_named_t *)ksp->ks_data; + flow_stat_init(knp); + kstat_install(ksp); +} + +void +flow_stat_destroy(flow_entry_t *fep) +{ + if (fep->fe_ksp != NULL) { + kstat_delete(fep->fe_ksp); + fep->fe_ksp = NULL; + } +} + +/* + * Initialize the flow table + */ +void +mac_flow_init() +{ + flow_cache = kmem_cache_create("flow_entry_cache", + sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + flow_tab_cache = kmem_cache_create("flow_tab_cache", + sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + flow_hash = mod_hash_create_extended("flow_hash", + 100, mod_hash_null_keydtor, mod_hash_null_valdtor, + mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); + rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL); +} + +/* + * Cleanup and release the flow table + */ +void +mac_flow_fini() +{ + kmem_cache_destroy(flow_cache); + kmem_cache_destroy(flow_tab_cache); + mod_hash_destroy_hash(flow_hash); + rw_destroy(&flow_tab_lock); +} + +/* + * mac_create_flow(): create a flow_entry_t. + */ +int +mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, + void *client_cookie, uint_t type, flow_entry_t **flentp) +{ + flow_entry_t *flent = *flentp; + int err = 0; + + if (mrp != NULL) { + err = mac_validate_props(mrp); + if (err != 0) + return (err); + } + + if (flent == NULL) { + flent = kmem_cache_alloc(flow_cache, KM_SLEEP); + bzero(flent, sizeof (*flent)); + mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); + + /* Initialize the receiver function to a safe routine */ + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_index = -1; + } + (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME); + + /* This is an initial flow, will be configured later */ + if (fd == NULL) { + *flentp = flent; + return (0); + } + + flent->fe_client_cookie = client_cookie; + flent->fe_type = type; + + /* + * As flow creation is only allowed in global zone, this will + * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will + * later set the right value. + */ + flent->fe_zoneid = getzoneid(); + + /* Save flow desc */ + bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); + + if (mrp != NULL) { + /* + * We have already set fe_resource_props for a Link. 
+ */ + if (type & FLOW_USER) { + bcopy(mrp, &flent->fe_resource_props, + sizeof (mac_resource_props_t)); + } + /* + * The effective resource list should reflect the priority + * that we set implicitly. + */ + if (!(mrp->mrp_mask & MRP_PRIORITY)) + mrp->mrp_mask |= MRP_PRIORITY; + if (type & FLOW_USER) + mrp->mrp_priority = MPL_SUBFLOW_DEFAULT; + else + mrp->mrp_priority = MPL_LINK_DEFAULT; + bcopy(mrp, &flent->fe_effective_props, + sizeof (mac_resource_props_t)); + } + flow_stat_create(flent); + + *flentp = flent; + return (0); +} + +/* + * Validate flow entry and add it to a flow table. + */ +int +mac_flow_add(flow_tab_t *ft, flow_entry_t *flent) +{ + flow_entry_t **headp, **p; + flow_ops_t *ops = &ft->ft_ops; + flow_mask_t mask; + uint32_t index; + int err; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + /* + * Check for invalid bits in mask. + */ + mask = flent->fe_flow_desc.fd_mask; + if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0) + return (EOPNOTSUPP); + + /* + * Validate flent. + */ + if ((err = ops->fo_accept_fe(ft, flent)) != 0) { + DTRACE_PROBE3(accept_failed, flow_tab_t *, ft, + flow_entry_t *, flent, int, err); + return (err); + } + + /* + * Flent is valid. now calculate hash and insert it + * into hash table. + */ + index = ops->fo_hash_fe(ft, flent); + + /* + * We do not need a lock up until now because we were + * not accessing the flow table. + */ + rw_enter(&ft->ft_lock, RW_WRITER); + headp = &ft->ft_table[index]; + + /* + * Check for duplicate flow. + */ + for (p = headp; *p != NULL; p = &(*p)->fe_next) { + if ((*p)->fe_flow_desc.fd_mask != + flent->fe_flow_desc.fd_mask) + continue; + + if (ft->ft_ops.fo_match_fe(ft, *p, flent)) { + rw_exit(&ft->ft_lock); + DTRACE_PROBE3(dup_flow, flow_tab_t *, ft, + flow_entry_t *, flent, int, err); + return (EALREADY); + } + } + + /* + * Insert flow to hash list. + */ + err = ops->fo_insert_fe(ft, headp, flent); + if (err != 0) { + rw_exit(&ft->ft_lock); + DTRACE_PROBE3(insert_failed, flow_tab_t *, ft, + flow_entry_t *, flent, int, err); + return (err); + } + + /* + * Save the hash index so it can be used by mac_flow_remove(). + */ + flent->fe_index = (int)index; + + /* + * Save the flow tab back reference. + */ + flent->fe_flow_tab = ft; + FLOW_MARK(flent, FE_FLOW_TAB); + ft->ft_flow_count++; + rw_exit(&ft->ft_lock); + return (0); +} + +/* + * Remove a flow from a mac client's subflow table + */ +void +mac_flow_rem_subflow(flow_entry_t *flent) +{ + flow_tab_t *ft = flent->fe_flow_tab; + mac_client_impl_t *mcip = ft->ft_mcip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + mac_flow_remove(ft, flent, B_FALSE); + if (flent->fe_mcip == NULL) { + /* + * The interface is not yet plumbed and mac_client_flow_add + * was not done. + */ + if (FLOW_TAB_EMPTY(ft)) { + mac_flow_tab_destroy(ft); + mcip->mci_subflow_tab = NULL; + } + return; + } + mac_flow_wait(flent, FLOW_DRIVER_UPCALL); + mac_link_flow_clean((mac_client_handle_t)mcip, flent); +} + +/* + * Add a flow to a mac client's subflow table and instantiate the flow + * in the mac by creating the associated SRSs etc. 
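+ *
+ * A minimal sketch of the creation path (hypothetical caller; compare
+ * mac_link_flow_add() below for the complete sequence):
+ *
+ *	flow_entry_t *flent = NULL;
+ *
+ *	err = mac_flow_create(&fd, mrp, "myflow", NULL,
+ *	    FLOW_USER | FLOW_OTHER, &flent);
+ *	if (err == 0)
+ *		err = mac_flow_add_subflow(mch, flent, B_TRUE);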
+ */ +int +mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent, + boolean_t instantiate_flow) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_tab_info_t *ftinfo; + flow_mask_t mask; + flow_tab_t *ft; + int err; + boolean_t ft_created = B_FALSE; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + /* + * If the subflow table exists already just add the new subflow + * to the existing table, else we create a new subflow table below. + */ + ft = mcip->mci_subflow_tab; + if (ft == NULL) { + mask = flent->fe_flow_desc.fd_mask; + /* + * Try to create a new table and then add the subflow to the + * newly created subflow table + */ + if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) + return (EOPNOTSUPP); + + mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size, + mcip->mci_mip, &ft); + ft_created = B_TRUE; + } + + err = mac_flow_add(ft, flent); + if (err != 0) { + if (ft_created) + mac_flow_tab_destroy(ft); + return (err); + } + + if (instantiate_flow) { + /* Now activate the flow by creating its SRSs */ + ASSERT(MCIP_DATAPATH_SETUP(mcip)); + err = mac_link_flow_init((mac_client_handle_t)mcip, flent); + if (err != 0) { + mac_flow_remove(ft, flent, B_FALSE); + if (ft_created) + mac_flow_tab_destroy(ft); + return (err); + } + } else { + FLOW_MARK(flent, FE_UF_NO_DATAPATH); + } + if (ft_created) { + ASSERT(mcip->mci_subflow_tab == NULL); + ft->ft_mcip = mcip; + mcip->mci_subflow_tab = ft; + if (instantiate_flow) + mac_client_update_classifier(mcip, B_TRUE); + } + return (0); +} + +/* + * Remove flow entry from flow table. + */ +void +mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp) +{ + flow_entry_t **fp; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + if (!(flent->fe_flags & FE_FLOW_TAB)) + return; + + rw_enter(&ft->ft_lock, RW_WRITER); + /* + * If this is a permanent removal from the flow table, mark it + * CONDEMNED to prevent future references. If this is a temporary + * removal from the table, say to update the flow descriptor then + * we don't mark it CONDEMNED + */ + if (!temp) + FLOW_MARK(flent, FE_CONDEMNED); + /* + * Locate the specified flent. + */ + fp = &ft->ft_table[flent->fe_index]; + while (*fp != flent) + fp = &(*fp)->fe_next; + + /* + * The flent must exist. Otherwise it's a bug. + */ + ASSERT(fp != NULL); + *fp = flent->fe_next; + flent->fe_next = NULL; + + /* + * Reset fe_index to -1 so any attempt to call mac_flow_remove() + * on a flent that is supposed to be in the table (FE_FLOW_TAB) + * will panic. + */ + flent->fe_index = -1; + FLOW_UNMARK(flent, FE_FLOW_TAB); + ft->ft_flow_count--; + rw_exit(&ft->ft_lock); +} + +/* + * This is the flow lookup routine used by the mac sw classifier engine. + */ +int +mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) +{ + flow_state_t s; + flow_entry_t *flent; + flow_ops_t *ops = &ft->ft_ops; + boolean_t retried = B_FALSE; + int i, err; + + s.fs_flags = flags; + s.fs_mp = mp; +retry: + + /* + * Walk the list of predeclared accept functions. + * Each of these would accumulate enough state to allow the next + * accept routine to make progress. + */ + for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { + if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { + /* + * ENOBUFS indicates that the mp could be too short + * and may need a pullup. + */ + if (err != ENOBUFS || retried) + return (err); + + /* + * Don't modify the mblk if there are references to it. + * Also, there is no point pulling up if b_cont is NULL. 
+ */ + if (DB_REF(mp) > 1 || mp->b_cont == NULL || + pullupmsg(mp, -1) == 0) + return (EINVAL); + + retried = B_TRUE; + DTRACE_PROBE2(need_pullup, flow_tab_t *, ft, + flow_state_t *, &s); + goto retry; + } + } + + /* + * The packet is considered sane. We may now attempt to + * find the corresponding flent. + */ + rw_enter(&ft->ft_lock, RW_READER); + flent = ft->ft_table[ops->fo_hash(ft, &s)]; + for (; flent != NULL; flent = flent->fe_next) { + if (flent->fe_match(ft, flent, &s)) { + FLOW_TRY_REFHOLD(flent, err); + if (err != 0) + continue; + *flentp = flent; + rw_exit(&ft->ft_lock); + return (0); + } + } + rw_exit(&ft->ft_lock); + return (ENOENT); +} + +/* + * Walk flow table. + * The caller is assumed to have proper perimeter protection. + */ +int +mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), + void *arg) +{ + int err, i, cnt = 0; + flow_entry_t *flent; + + if (ft == NULL) + return (0); + + for (i = 0; i < ft->ft_size; i++) { + for (flent = ft->ft_table[i]; flent != NULL; + flent = flent->fe_next) { + cnt++; + err = (*fn)(flent, arg); + if (err != 0) + return (err); + } + } + VERIFY(cnt == ft->ft_flow_count); + return (0); +} + +/* + * Same as the above except a mutex is used for protection here. + */ +int +mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), + void *arg) +{ + int err; + + if (ft == NULL) + return (0); + + rw_enter(&ft->ft_lock, RW_WRITER); + err = mac_flow_walk_nolock(ft, fn, arg); + rw_exit(&ft->ft_lock); + return (err); +} + +static boolean_t mac_flow_clean(flow_entry_t *); + +/* + * Destroy a flow entry. Called when the last reference on a flow is released. + */ +void +mac_flow_destroy(flow_entry_t *flent) +{ + ASSERT(flent->fe_refcnt == 0); + + if ((flent->fe_type & FLOW_USER) != 0) { + ASSERT(mac_flow_clean(flent)); + } else { + mac_flow_cleanup(flent); + } + + mutex_destroy(&flent->fe_lock); + cv_destroy(&flent->fe_cv); + flow_stat_destroy(flent); + kmem_cache_free(flow_cache, flent); +} + +/* + * XXX eric + * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and + * mac_link_flow_modify() should really be moved/reworked into the + * two functions below. This would consolidate all the mac property + * checking in one place. I'm leaving this alone for now since it's + * out of scope of the new flows work. + */ +/* ARGSUSED */ +uint32_t +mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp) +{ + uint32_t changed_mask = 0; + mac_resource_props_t *fmrp = &flent->fe_effective_props; + int i; + + if ((mrp->mrp_mask & MRP_MAXBW) != 0 && + (fmrp->mrp_maxbw != mrp->mrp_maxbw)) { + changed_mask |= MRP_MAXBW; + fmrp->mrp_maxbw = mrp->mrp_maxbw; + if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { + fmrp->mrp_mask &= ~MRP_MAXBW; + } else { + fmrp->mrp_mask |= MRP_MAXBW; + } + } + + if ((mrp->mrp_mask & MRP_PRIORITY) != 0) { + if (fmrp->mrp_priority != mrp->mrp_priority) + changed_mask |= MRP_PRIORITY; + if (mrp->mrp_priority == MPL_RESET) { + fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT; + fmrp->mrp_mask &= ~MRP_PRIORITY; + } else { + fmrp->mrp_priority = mrp->mrp_priority; + fmrp->mrp_mask |= MRP_PRIORITY; + } + } + + /* modify fanout */ + if ((mrp->mrp_mask & MRP_CPUS) != 0) { + if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) && + (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) { + for (i = 0; i < mrp->mrp_ncpus; i++) { + if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i]) + break; + } + if (i == mrp->mrp_ncpus) { + /* + * The new set of cpus passed is exactly + * the same as the existing set. 
+ */ + return (changed_mask); + } + } + changed_mask |= MRP_CPUS; + MAC_COPY_CPUS(mrp, fmrp); + } + return (changed_mask); +} + +void +mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp) +{ + uint32_t changed_mask; + mac_client_impl_t *mcip = flent->fe_mcip; + mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + + ASSERT(flent != NULL); + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + rw_enter(&ft->ft_lock, RW_WRITER); + + /* Update the cached values inside the subflow entry */ + changed_mask = mac_flow_modify_props(flent, mrp); + rw_exit(&ft->ft_lock); + /* + * Push the changed parameters to the scheduling code in the + * SRS's, to take effect right away. + */ + if (changed_mask & MRP_MAXBW) { + mac_srs_update_bwlimit(flent, mrp); + /* + * If bandwidth is changed, we may have to change + * the number of soft ring to be used for fanout. + * Call mac_flow_update_fanout() if MAC_BIND_CPU + * is not set and there is no user supplied cpu + * info. This applies only to link at this time. + */ + if (!(flent->fe_type & FLOW_USER) && + !(changed_mask & MRP_CPUS) && + !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) { + mac_fanout_setup(mcip, flent, mcip_mrp, + mac_rx_deliver, mcip, NULL); + } + } + if (mrp->mrp_mask & MRP_PRIORITY) + mac_flow_update_priority(mcip, flent); + + if (changed_mask & MRP_CPUS) + mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL); +} + +/* + * This function waits for a certain condition to be met and is generally + * used before a destructive or quiescing operation. + */ +void +mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) +{ + mutex_enter(&flent->fe_lock); + flent->fe_flags |= FE_WAITER; + + switch (event) { + case FLOW_DRIVER_UPCALL: + /* + * We want to make sure the driver upcalls have finished before + * we signal the Rx SRS worker to quit. + */ + while (flent->fe_refcnt != 1) + cv_wait(&flent->fe_cv, &flent->fe_lock); + break; + + case FLOW_USER_REF: + /* + * Wait for the fe_user_refcnt to drop to 0. The flow has + * been removed from the global flow hash. + */ + ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); + while (flent->fe_user_refcnt != 0) + cv_wait(&flent->fe_cv, &flent->fe_lock); + break; + + default: + ASSERT(0); + } + + flent->fe_flags &= ~FE_WAITER; + mutex_exit(&flent->fe_lock); +} + +static boolean_t +mac_flow_clean(flow_entry_t *flent) +{ + ASSERT(flent->fe_next == NULL); + ASSERT(flent->fe_tx_srs == NULL); + ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); + ASSERT(flent->fe_mbg == NULL); + + return (B_TRUE); +} + +void +mac_flow_cleanup(flow_entry_t *flent) +{ + if ((flent->fe_type & FLOW_USER) == 0) { + ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || + (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); + ASSERT(flent->fe_refcnt == 0); + } else { + ASSERT(flent->fe_refcnt == 1); + } + + if (flent->fe_mbg != NULL) { + ASSERT(flent->fe_tx_srs == NULL); + /* This is a multicast or broadcast flow entry */ + mac_bcast_grp_free(flent->fe_mbg); + flent->fe_mbg = NULL; + } + + if (flent->fe_tx_srs != NULL) { + ASSERT(flent->fe_mbg == NULL); + mac_srs_free(flent->fe_tx_srs); + flent->fe_tx_srs = NULL; + } + + /* + * In the normal case fe_rx_srs_cnt is 1. However in the error case + * when mac_unicast_add fails we may not have set up any SRS + * in which case fe_rx_srs_cnt will be zero. 
+	 */
+	if (flent->fe_rx_srs_cnt != 0) {
+		ASSERT(flent->fe_rx_srs_cnt == 1);
+		mac_srs_free(flent->fe_rx_srs[0]);
+		flent->fe_rx_srs[0] = NULL;
+		flent->fe_rx_srs_cnt = 0;
+	}
+	ASSERT(flent->fe_rx_srs[0] == NULL);
+}
+
+void
+mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
+{
+	/*
+	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc happen under the fe_lock
+	 * after removing the flent from the flow table.
+	 */
+	mutex_enter(&flent->fe_lock);
+	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
+	mutex_exit(&flent->fe_lock);
+}
+
+/*
+ * Update a field of a flow entry. The mac perimeter ensures that
+ * this is the only thread doing a modify operation on this mac end point.
+ * So the flow table can't change or disappear. The ft_lock protects access
+ * to the flow entry, and holding the lock ensures that there isn't any thread
+ * accessing the flow entry or attempting a flow table lookup. However,
+ * data threads that are using the flow entry based on the old descriptor
+ * will continue to use the flow entry. If strong coherence is required
+ * then the flow will have to be quiesced before the descriptor can be
+ * changed.
+ */
+void
+mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
+{
+	flow_tab_t *ft = flent->fe_flow_tab;
+	flow_desc_t old_desc;
+	int err;
+
+	if (ft == NULL) {
+		/*
+		 * The flow hasn't yet been inserted into the table,
+		 * so only the caller knows about this flow; however, for
+		 * uniformity we grab the fe_lock here.
+		 */
+		mutex_enter(&flent->fe_lock);
+		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+		mutex_exit(&flent->fe_lock);
+		return;
+	}
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+	/*
+	 * Need to remove the flow entry from the table and reinsert it,
+	 * into a potentially different hash line. The hash depends on
+	 * the new descriptor fields. However, access to fe_flow_desc itself
+	 * is always under the fe_lock. This helps log and stat functions
+	 * see a self-consistent fe_flow_desc.
+	 */
+	mac_flow_remove(ft, flent, B_TRUE);
+	old_desc = flent->fe_flow_desc;
+
+	mutex_enter(&flent->fe_lock);
+	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+	mutex_exit(&flent->fe_lock);
+
+	if (mac_flow_add(ft, flent) != 0) {
+		/*
+		 * The add failed, say due to an invalid flow descriptor.
+		 * Undo the update.
+		 */
+		flent->fe_flow_desc = old_desc;
+		err = mac_flow_add(ft, flent);
+		ASSERT(err == 0);
+	}
+}
+
+void
+mac_flow_set_name(flow_entry_t *flent, const char *name)
+{
+	flow_tab_t *ft = flent->fe_flow_tab;
+
+	if (ft == NULL) {
+		/*
+		 * The flow hasn't yet been inserted into the table,
+		 * so only the caller knows about this flow.
+		 */
+		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+	} else {
+		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+	}
+
+	mutex_enter(&flent->fe_lock);
+	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+	mutex_exit(&flent->fe_lock);
+}
+
+/*
+ * Return the client-private cookie that was associated with
+ * the flow when it was created.
+ */
+void *
+mac_flow_get_client_cookie(flow_entry_t *flent)
+{
+	return (flent->fe_client_cookie);
+}
+
+/*
+ * Forward declarations.
+ */
+static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
+static int flow_l2_accept(flow_tab_t *, flow_state_t *);
+static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
+static int flow_ether_accept(flow_tab_t *, flow_state_t *);
+
+/*
+ * Create flow table.
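+ * For example, the L2 table used by mac_flow_l2tab_create() below is
+ * built as:
+ *
+ *	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
+ *	    1024, mip, &ft);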
+ */ +void +mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, + mac_impl_t *mip, flow_tab_t **ftp) +{ + flow_tab_t *ft; + flow_ops_t *new_ops; + + ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); + bzero(ft, sizeof (*ft)); + + ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); + + /* + * We make a copy of the ops vector instead of just pointing to it + * because we might want to customize the ops vector on a per table + * basis (e.g. for optimization). + */ + new_ops = &ft->ft_ops; + bcopy(ops, new_ops, sizeof (*ops)); + ft->ft_mask = mask; + ft->ft_size = size; + ft->ft_mip = mip; + + /* + * Optimization for DL_ETHER media. + */ + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + if (new_ops->fo_hash == flow_l2_hash) + new_ops->fo_hash = flow_ether_hash; + + if (new_ops->fo_accept[0] == flow_l2_accept) + new_ops->fo_accept[0] = flow_ether_accept; + + } + *ftp = ft; +} + +void +mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) +{ + mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, + 1024, mip, ftp); +} + +/* + * Destroy flow table. + */ +void +mac_flow_tab_destroy(flow_tab_t *ft) +{ + if (ft == NULL) + return; + + ASSERT(ft->ft_flow_count == 0); + kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); + bzero(ft, sizeof (*ft)); + kmem_cache_free(flow_tab_cache, ft); +} + +/* + * Add a new flow entry to the global flow hash table + */ +int +mac_flow_hash_add(flow_entry_t *flent) +{ + int err; + + rw_enter(&flow_tab_lock, RW_WRITER); + err = mod_hash_insert(flow_hash, + (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); + if (err != 0) { + rw_exit(&flow_tab_lock); + return (EEXIST); + } + /* Mark as inserted into the global flow hash table */ + FLOW_MARK(flent, FE_G_FLOW_HASH); + rw_exit(&flow_tab_lock); + return (err); +} + +/* + * Remove a flow entry from the global flow hash table + */ +void +mac_flow_hash_remove(flow_entry_t *flent) +{ + mod_hash_val_t val; + + rw_enter(&flow_tab_lock, RW_WRITER); + VERIFY(mod_hash_remove(flow_hash, + (mod_hash_key_t)flent->fe_flow_name, &val) == 0); + + /* Clear the mark that says inserted into the global flow hash table */ + FLOW_UNMARK(flent, FE_G_FLOW_HASH); + rw_exit(&flow_tab_lock); +} + +/* + * Retrieve a flow entry from the global flow hash table. + */ +int +mac_flow_lookup_byname(char *name, flow_entry_t **flentp) +{ + int err; + flow_entry_t *flent; + + rw_enter(&flow_tab_lock, RW_READER); + err = mod_hash_find(flow_hash, (mod_hash_key_t)name, + (mod_hash_val_t *)&flent); + if (err != 0) { + rw_exit(&flow_tab_lock); + return (ENOENT); + } + ASSERT(flent != NULL); + FLOW_USER_REFHOLD(flent); + rw_exit(&flow_tab_lock); + + *flentp = flent; + return (0); +} + +/* + * Initialize or release mac client flows by walking the subflow table. + * These are typically invoked during plumb/unplumb of links. 
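+ *
+ * Illustrative pairing (hypothetical caller):
+ *
+ *	mac_link_init_flows(mch);	... link plumbed, subflows activated
+ *	...
+ *	mac_link_release_flows(mch);	... link unplumbed, subflows quiesced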
+ */
+
+static int
+mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
+{
+	mac_client_impl_t *mcip = arg;
+
+	if (mac_link_flow_init(arg, flent) != 0) {
+		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
+		    flent->fe_flow_name, mcip->mci_name);
+	} else {
+		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
+	}
+	return (0);
+}
+
+void
+mac_link_init_flows(mac_client_handle_t mch)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+	    mac_link_init_flows_cb, mcip);
+	/*
+	 * If the mac client had subflow(s) configured before plumb, change
+	 * its callback function to mac_rx_srs_subflow_process and, in case
+	 * of hardware classification, disable polling.
+	 */
+	mac_client_update_classifier(mcip, B_TRUE);
+}
+
+boolean_t
+mac_link_has_flows(mac_client_handle_t mch)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static int
+mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
+{
+	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
+	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+	mac_link_flow_clean(arg, flent);
+	return (0);
+}
+
+void
+mac_link_release_flows(mac_client_handle_t mch)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+	/*
+	 * Change the mci_flent callback back to mac_rx_srs_process()
+	 * because flows are about to be deactivated.
+	 */
+	mac_client_update_classifier(mcip, B_FALSE);
+	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+	    mac_link_release_flows_cb, mcip);
+}
+
+void
+mac_rename_flow(flow_entry_t *fep, const char *new_name)
+{
+	mac_flow_set_name(fep, new_name);
+	if (fep->fe_ksp != NULL) {
+		flow_stat_destroy(fep);
+		flow_stat_create(fep);
+	}
+}
+
+/*
+ * mac_link_flow_init()
+ * Internal flow interface used for allocating SRSs and related
+ * data structures. Not meant to be used by mac clients.
+ */
+int
+mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+	mac_impl_t *mip = mcip->mci_mip;
+	int err;
+
+	ASSERT(mch != NULL);
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
+		return (err);
+
+	sub_flow->fe_mcip = mcip;
+
+	return (0);
+}
+
+/*
+ * mac_link_flow_add()
+ * Used by flowadm(1M) or kernel mac clients for creating flows.
+ */
+int
+mac_link_flow_add(datalink_id_t linkid, char *flow_name,
+    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
+{
+	flow_entry_t *flent = NULL;
+	int err;
+	dls_dl_handle_t dlh;
+	dls_link_t *dlp;
+	boolean_t link_held = B_FALSE;
+	boolean_t hash_added = B_FALSE;
+	mac_perim_handle_t mph;
+
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err == 0) {
+		FLOW_USER_REFRELE(flent);
+		return (EEXIST);
+	}
+
+	/*
+	 * First create a flow entry given the description provided
+	 * by the caller.
+	 */
+	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
+	    FLOW_USER | FLOW_OTHER, &flent);
+
+	if (err != 0)
+		return (err);
+
+	/*
+	 * We've got a local variable referencing this flow now, so we need
+	 * to hold it. We'll release this flow before returning.
+	 * All failures until we return will undo any action that may have
+	 * internally taken a hold on the flow, so the last REFRELE will
+	 * assure a clean freeing of resources.
+	 */
+	FLOW_REFHOLD(flent);
+
+	flent->fe_link_id = linkid;
+	FLOW_MARK(flent, FE_INCIPIENT);
+
+	err = mac_perim_enter_by_linkid(linkid, &mph);
+	if (err != 0) {
+		FLOW_FINAL_REFRELE(flent);
+		return (err);
+	}
+
+	/*
+	 * dls will eventually be merged with mac so it's ok
+	 * to call dls' internal functions.
+	 */
+	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
+	if (err != 0)
+		goto bail;
+
+	link_held = B_TRUE;
+
+	/*
+	 * Add the flow to the global flow table; this table will be per
+	 * exclusive zone so each zone can have its own flow namespace.
+	 * RFE 6625651 will fix this.
+	 */
+	if ((err = mac_flow_hash_add(flent)) != 0)
+		goto bail;
+
+	hash_added = B_TRUE;
+
+	/*
+	 * Do not allow flows to be configured on an anchor VNIC.
+	 */
+	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
+		err = ENOTSUP;
+		goto bail;
+	}
+
+	/*
+	 * Save the zoneid of the underlying link in the flow entry;
+	 * this is needed to prevent a non-global zone from getting
+	 * statistics information of the global zone.
+	 */
+	flent->fe_zoneid = dlp->dl_zid;
+
+	/*
+	 * Add the subflow to the subflow table. Also instantiate the flow
+	 * in the mac if there is an active DLS user. The dl_mah is set when
+	 * dls_active_set() is called, typically during interface plumb.
+	 */
+	err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
+	if (err != 0)
+		goto bail;
+
+	FLOW_UNMARK(flent, FE_INCIPIENT);
+	dls_devnet_rele_link(dlh, dlp);
+	mac_perim_exit(mph);
+	return (0);
+
+bail:
+	if (hash_added)
+		mac_flow_hash_remove(flent);
+
+	if (link_held)
+		dls_devnet_rele_link(dlh, dlp);
+
+	/*
+	 * Wait for any transient global flow hash refs to clear
+	 * and then release the creation reference on the flow.
+	 */
+	mac_flow_wait(flent, FLOW_USER_REF);
+	FLOW_FINAL_REFRELE(flent);
+	mac_perim_exit(mph);
+	return (err);
+}
+
+/*
+ * mac_link_flow_clean()
+ * Internal flow interface used for freeing SRSs and related
+ * data structures. Not meant to be used by mac clients.
+ */
+void
+mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+	mac_impl_t *mip = mcip->mci_mip;
+	boolean_t last_subflow;
+
+	ASSERT(mch != NULL);
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	/*
+	 * This sub flow entry may fail to be fully initialized by
+	 * mac_link_flow_init(). If so, simply return.
+	 */
+	if (sub_flow->fe_mcip == NULL)
+		return;
+
+	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
+	/*
+	 * Tear down the data path.
+	 */
+	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
+	sub_flow->fe_mcip = NULL;
+
+	/*
+	 * Delete the SRSs associated with this subflow. If this is being
+	 * driven by flowadm(1M) then the subflow will be deleted by
+	 * dls_rem_flow. However, if this is a result of the interface being
+	 * unplumbed then the subflow itself won't be deleted.
+	 */
+	mac_flow_cleanup(sub_flow);
+
+	/*
+	 * If all the subflows are gone, re-enable some of the things
+	 * we disabled when adding a subflow, polling etc.
+	 */
+	if (last_subflow) {
+		/*
+		 * The subflow table itself is not protected by any locks or
+		 * refcnts. Hence quiesce the client upfront before clearing
+		 * mci_subflow_tab.
+		 */
+		mac_client_quiesce(mcip);
+		mac_client_update_classifier(mcip, B_FALSE);
+		mac_flow_tab_destroy(mcip->mci_subflow_tab);
+		mcip->mci_subflow_tab = NULL;
+		mac_client_restart(mcip);
+	}
+}
+
+/*
+ * mac_link_flow_remove()
+ * Used by flowadm(1M) or kernel mac clients for removing flows.
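+ *
+ * A hedged usage sketch (hypothetical names and values; see
+ * mac_link_flow_add() above for the creation side):
+ *
+ *	flow_desc_t fd;
+ *
+ *	bzero(&fd, sizeof (fd));
+ *	fd.fd_mask = FLOW_LINK_DST | FLOW_LINK_VID;
+ *	fd.fd_mac_len = ETHERADDRL;
+ *	bcopy(dst_mac, fd.fd_dst_mac, ETHERADDRL);
+ *	fd.fd_vid = 5;
+ *	if (mac_link_flow_add(linkid, "vid5flow", &fd, mrp) == 0)
+ *		(void) mac_link_flow_remove("vid5flow");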
+ */
+int
+mac_link_flow_remove(char *flow_name)
+{
+	flow_entry_t *flent;
+	mac_perim_handle_t mph;
+	int err;
+	datalink_id_t linkid;
+
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0)
+		return (err);
+
+	linkid = flent->fe_link_id;
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * The perim must be acquired before acquiring any other references
+	 * to maintain the lock and perimeter hierarchy. Please note the
+	 * FLOW_USER_REFRELE above.
+	 */
+	err = mac_perim_enter_by_linkid(linkid, &mph);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * Note the second lookup of the flow, because a concurrent thread
+	 * may have removed it already while we were waiting to enter the
+	 * link's perimeter.
+	 */
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0) {
+		mac_perim_exit(mph);
+		return (err);
+	}
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * Remove the flow from the subflow table and deactivate the flow
+	 * by quiescing and removing its SRSs.
+	 */
+	mac_flow_rem_subflow(flent);
+
+	/*
+	 * Finally, remove the flow from the global table.
+	 */
+	mac_flow_hash_remove(flent);
+
+	/*
+	 * Wait for any transient global flow hash refs to clear
+	 * and then release the creation reference on the flow.
+	 */
+	mac_flow_wait(flent, FLOW_USER_REF);
+	FLOW_FINAL_REFRELE(flent);
+
+	mac_perim_exit(mph);
+
+	return (0);
+}
+
+/*
+ * mac_link_flow_modify()
+ * Modifies the properties of a flow identified by its name.
+ */
+int
+mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
+{
+	flow_entry_t *flent;
+	mac_client_impl_t *mcip;
+	int err = 0;
+	mac_perim_handle_t mph;
+	datalink_id_t linkid;
+	flow_tab_t *flow_tab;
+
+	err = mac_validate_props(mrp);
+	if (err != 0)
+		return (err);
+
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0)
+		return (err);
+
+	linkid = flent->fe_link_id;
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * The perim must be acquired before acquiring any other references
+	 * to maintain the lock and perimeter hierarchy. Please note the
+	 * FLOW_USER_REFRELE above.
+	 */
+	err = mac_perim_enter_by_linkid(linkid, &mph);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * Note the second lookup of the flow, because a concurrent thread
+	 * may have removed it already while we were waiting to enter the
+	 * link's perimeter.
+	 */
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0) {
+		mac_perim_exit(mph);
+		return (err);
+	}
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * If this flow is attached to a MAC client, then pass the request
+	 * along to the client.
+	 * Otherwise, just update the cached values.
+	 */
+	mcip = flent->fe_mcip;
+	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
+	if (mcip != NULL) {
+		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
+			err = ENOENT;
+		} else {
+			mac_flow_modify(flow_tab, flent, mrp);
+		}
+	} else {
+		(void) mac_flow_modify_props(flent, mrp);
+	}
+
+done:
+	mac_perim_exit(mph);
+	return (err);
+}
+
+
+/*
+ * State structure and misc functions used by mac_link_flow_walk().
+ */
+ */ +typedef struct { + int (*ws_func)(mac_flowinfo_t *, void *); + void *ws_arg; +} flow_walk_state_t; + +static void +mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) +{ + (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, MAXNAMELEN); + finfop->fi_link_id = flent->fe_link_id; + finfop->fi_flow_desc = flent->fe_flow_desc; + finfop->fi_resource_props = flent->fe_resource_props; +} + +static int +mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) +{ + flow_walk_state_t *statep = arg; + mac_flowinfo_t finfo; + + mac_link_flowinfo_copy(&finfo, flent); + return (statep->ws_func(&finfo, statep->ws_arg)); +} + +/* + * mac_link_flow_walk() + * Invokes callback 'func' for all flows belonging to the specified link. + */ +int +mac_link_flow_walk(datalink_id_t linkid, + int (*func)(mac_flowinfo_t *, void *), void *arg) +{ + mac_client_impl_t *mcip; + mac_perim_handle_t mph; + flow_walk_state_t state; + dls_dl_handle_t dlh; + dls_link_t *dlp; + int err; + + err = mac_perim_enter_by_linkid(linkid, &mph); + if (err != 0) + return (err); + + err = dls_devnet_hold_link(linkid, &dlh, &dlp); + if (err != 0) { + mac_perim_exit(mph); + return (err); + } + + mcip = (mac_client_impl_t *)dlp->dl_mch; + state.ws_func = func; + state.ws_arg = arg; + + err = mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_link_flow_walk_cb, &state); + + dls_devnet_rele_link(dlh, dlp); + mac_perim_exit(mph); + return (err); +} + +/* + * mac_link_flow_info() + * Retrieves information about a specific flow. + */ +int +mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) +{ + flow_entry_t *flent; + int err; + + err = mac_flow_lookup_byname(flow_name, &flent); + if (err != 0) + return (err); + + mac_link_flowinfo_copy(finfo, flent); + FLOW_USER_REFRELE(flent); + return (0); +} + +#define HASH_MAC_VID(a, v, s) \ + ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) + +#define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) + +/* ARGSUSED */ +static boolean_t +flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + flow_desc_t *fd = &flent->fe_flow_desc; + + return (l2->l2_vid == fd->fd_vid && + bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); +} + +/* + * Layer 2 hash function. + * Must be paired with flow_l2_accept() within a set of flow_ops + * because it assumes the dest address is already extracted. + */ +static uint32_t +flow_l2_hash(flow_tab_t *ft, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + + return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); +} + +/* + * This is the generic layer 2 accept function. + * It makes use of mac_header_info() to extract the header length, + * sap, vlan ID and destination address. 
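+ *
+ * As a worked example (illustrative only): once a destination MAC of
+ * 00:00:00:aa:bb:cc and VID 5 have been extracted, the L2 hash above
+ * computes
+ *
+ *	HASH_MAC_VID(daddr, 5, 1024)
+ *	    = ((0xaa + 0xbb + 0xcc) ^ 5) % 1024
+ *	    = (561 ^ 5) % 1024 = 564
+ *
+ * so the packet is looked up in hash line 564 of a 1024-entry table.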
+ */ +static int +flow_l2_accept(flow_tab_t *ft, flow_state_t *s) +{ + boolean_t is_ether; + flow_l2info_t *l2 = &s->fs_l2info; + mac_header_info_t mhi; + int err; + + is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER); + if ((err = mac_header_info((mac_handle_t)ft->ft_mip, + s->fs_mp, &mhi)) != 0) { + if (err == EINVAL) + err = ENOBUFS; + + return (err); + } + + l2->l2_start = s->fs_mp->b_rptr; + l2->l2_daddr = (uint8_t *)mhi.mhi_daddr; + + if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN && + ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { + struct ether_vlan_header *evhp = + (struct ether_vlan_header *)l2->l2_start; + + if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) + return (ENOBUFS); + + l2->l2_sap = ntohs(evhp->ether_type); + l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); + l2->l2_hdrsize = sizeof (*evhp); + } else { + l2->l2_sap = mhi.mhi_bindsap; + l2->l2_vid = 0; + l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize; + } + return (0); +} + +/* + * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/ + * accept(). The notable difference is that dest address is now extracted + * by hash() rather than by accept(). This saves a few memory references + * for flow tables that do not care about mac addresses. + */ +static uint32_t +flow_ether_hash(flow_tab_t *ft, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)l2->l2_start; + l2->l2_daddr = evhp->ether_dhost.ether_addr_octet; + return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); +} + +/* ARGSUSED */ +static int +flow_ether_accept(flow_tab_t *ft, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + struct ether_vlan_header *evhp; + uint16_t sap; + + evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr; + l2->l2_start = (uchar_t *)evhp; + + if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header))) + return (ENOBUFS); + + if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN && + ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { + if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) + return (ENOBUFS); + + l2->l2_sap = ntohs(evhp->ether_type); + l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); + l2->l2_hdrsize = sizeof (struct ether_vlan_header); + } else { + l2->l2_sap = sap; + l2->l2_vid = 0; + l2->l2_hdrsize = sizeof (struct ether_header); + } + return (0); +} + +/* + * Validates a layer 2 flow entry. + */ +static int +flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent) +{ + int i; + flow_desc_t *fd = &flent->fe_flow_desc; + + /* + * Dest address is mandatory. + */ + if ((fd->fd_mask & FLOW_LINK_DST) == 0) + return (EINVAL); + + for (i = 0; i < fd->fd_mac_len; i++) { + if (fd->fd_dst_mac[i] != 0) + break; + } + if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL) + return (EINVAL); + + if ((fd->fd_mask & FLOW_LINK_VID) != 0) { + /* + * VLAN flows are only supported over ethernet macs. + */ + if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER) + return (EINVAL); + + if (fd->fd_vid == 0) + return (EINVAL); + + } + flent->fe_match = flow_l2_match; + return (0); +} + +/* + * Calculates hash index of flow entry. + */ +static uint32_t +flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent) +{ + flow_desc_t *fd = &flent->fe_flow_desc; + + ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0); + return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size)); +} + +/* + * This is used for duplicate flow checking. 
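+ * Two L2 flow entries are considered duplicates iff both the
+ * destination MAC address and the VID compare equal, i.e. at most one
+ * flow can be created over a given <dst-mac, vid> pair on a link.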
+ */ +/* ARGSUSED */ +static boolean_t +flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) +{ + flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; + + ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0); + return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac, + fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid); +} + +/* + * Generic flow entry insertion function. + * Used by flow tables that do not have ordering requirements. + */ +/* ARGSUSED */ +static int +flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp, + flow_entry_t *flent) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + if (*headp != NULL) { + ASSERT(flent->fe_next == NULL); + flent->fe_next = *headp; + } + *headp = flent; + return (0); +} + +/* + * IP version independent DSField matching function. + */ +/* ARGSUSED */ +static boolean_t +flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + + switch (l3info->l3_version) { + case IPV4_VERSION: { + ipha_t *ipha = (ipha_t *)l3info->l3_start; + + return ((ipha->ipha_type_of_service & + fd->fd_dsfield_mask) == fd->fd_dsfield); + } + case IPV6_VERSION: { + ip6_t *ip6h = (ip6_t *)l3info->l3_start; + + return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) & + fd->fd_dsfield_mask) == fd->fd_dsfield); + } + default: + return (B_FALSE); + } +} + +/* + * IP v4 and v6 address matching. + * The netmask only needs to be applied on the packet but not on the + * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets. + */ + +/* ARGSUSED */ +static boolean_t +flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + ipha_t *ipha = (ipha_t *)l3info->l3_start; + in_addr_t addr; + + addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src); + if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { + return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) == + V4_PART_OF_V6(fd->fd_local_addr)); + } + return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) == + V4_PART_OF_V6(fd->fd_remote_addr)); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + ip6_t *ip6h = (ip6_t *)l3info->l3_start; + in6_addr_t *addrp; + + addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src); + if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { + return (V6_MASK_EQ(*addrp, fd->fd_local_netmask, + fd->fd_local_addr)); + } + return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr)); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + + return (l3info->l3_protocol == fd->fd_protocol); +} + +static uint32_t +flow_ip_hash(flow_tab_t *ft, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_mask_t mask = ft->ft_mask; + + if ((mask & FLOW_IP_LOCAL) != 0) { + l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); + } else if ((mask & FLOW_IP_REMOTE) != 0) { + l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); + } else if ((mask & FLOW_IP_DSFIELD) != 0) { + /* + * DSField flents are arranged as a single list. + */ + return (0); + } + /* + * IP addr flents are hashed into two lists, v4 or v6. 
+	 */
+	ASSERT(ft->ft_size >= 2);
+	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
+}
+
+static uint32_t
+flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+
+	return (l3info->l3_protocol % ft->ft_size);
+}
+
+/* ARGSUSED */
+static int
+flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l2info_t	*l2info = &s->fs_l2info;
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	uint16_t	sap = l2info->l2_sap;
+	uchar_t		*l3_start;
+
+	l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
+	if (!OK_32PTR(l3_start))
+		return (EINVAL);
+
+	switch (sap) {
+	case ETHERTYPE_IP: {
+		ipha_t	*ipha = (ipha_t *)l3_start;
+
+		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
+			return (ENOBUFS);
+
+		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
+		l3info->l3_protocol = ipha->ipha_protocol;
+		l3info->l3_version = IPV4_VERSION;
+		l3info->l3_fragmented =
+		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
+		break;
+	}
+	case ETHERTYPE_IPV6: {
+		ip6_t		*ip6h = (ip6_t *)l3_start;
+		uint16_t	ip6_hdrlen;
+		uint8_t		nexthdr;
+
+		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
+		    &nexthdr)) {
+			return (ENOBUFS);
+		}
+		l3info->l3_hdrsize = ip6_hdrlen;
+		l3info->l3_protocol = nexthdr;
+		l3info->l3_version = IPV6_VERSION;
+		l3info->l3_fragmented = B_FALSE;
+		break;
+	}
+	default:
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	switch (fd->fd_protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+	case IPPROTO_ICMP:
+	case IPPROTO_ICMPV6:
+		flent->fe_match = flow_ip_proto_match;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/* ARGSUSED */
+static int
+flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+	flow_mask_t	mask;
+	uint8_t		version;
+	in6_addr_t	*addr, *netmask;
+
+	/*
+	 * DSField does not require an IP version.
+	 */
+	if (fd->fd_mask == FLOW_IP_DSFIELD) {
+		if (fd->fd_dsfield_mask == 0)
+			return (EINVAL);
+
+		flent->fe_match = flow_ip_dsfield_match;
+		return (0);
+	}
+
+	/*
+	 * IP addresses must come with a version to avoid ambiguity.
+	 */
+	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
+		return (EINVAL);
+
+	version = fd->fd_ipversion;
+	if (version != IPV4_VERSION && version != IPV6_VERSION)
+		return (EINVAL);
+
+	mask = fd->fd_mask & ~FLOW_IP_VERSION;
+	switch (mask) {
+	case FLOW_IP_LOCAL:
+		addr = &fd->fd_local_addr;
+		netmask = &fd->fd_local_netmask;
+		break;
+	case FLOW_IP_REMOTE:
+		addr = &fd->fd_remote_addr;
+		netmask = &fd->fd_remote_netmask;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	/*
+	 * Apply the netmask to the specified address.
+	 */
+	V6_MASK_COPY(*addr, *netmask, *addr);
+	if (version == IPV4_VERSION) {
+		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
+		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
+
+		if (v4addr == 0 || v4mask == 0)
+			return (EINVAL);
+		flent->fe_match = flow_ip_v4_match;
+	} else {
+		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
+		    IN6_IS_ADDR_UNSPECIFIED(netmask))
+			return (EINVAL);
+		flent->fe_match = flow_ip_v6_match;
+	}
+	return (0);
+}
+
+static uint32_t
+flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	return (fd->fd_protocol % ft->ft_size);
+}
+
+static uint32_t
+flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	/*
+	 * DSField flents are arranged as a single list.
+ */ + if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) + return (0); + + /* + * IP addr flents are hashed into two lists, v4 or v6. + */ + ASSERT(ft->ft_size >= 2); + return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) +{ + flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; + + return (fd1->fd_protocol == fd2->fd_protocol); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) +{ + flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; + in6_addr_t *a1, *m1, *a2, *m2; + + ASSERT(fd1->fd_mask == fd2->fd_mask); + if (fd1->fd_mask == FLOW_IP_DSFIELD) { + return (fd1->fd_dsfield == fd2->fd_dsfield && + fd1->fd_dsfield_mask == fd2->fd_dsfield_mask); + } + + /* + * flow_ip_accept_fe() already validated the version. + */ + ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0); + if (fd1->fd_ipversion != fd2->fd_ipversion) + return (B_FALSE); + + switch (fd1->fd_mask & ~FLOW_IP_VERSION) { + case FLOW_IP_LOCAL: + a1 = &fd1->fd_local_addr; + m1 = &fd1->fd_local_netmask; + a2 = &fd2->fd_local_addr; + m2 = &fd2->fd_local_netmask; + break; + case FLOW_IP_REMOTE: + a1 = &fd1->fd_remote_addr; + m1 = &fd1->fd_remote_netmask; + a2 = &fd2->fd_remote_addr; + m2 = &fd2->fd_remote_netmask; + break; + default: + /* + * This is unreachable given the checks in + * flow_ip_accept_fe(). + */ + return (B_FALSE); + } + + if (fd1->fd_ipversion == IPV4_VERSION) { + return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) && + V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2))); + + } else { + return (IN6_ARE_ADDR_EQUAL(a1, a2) && + IN6_ARE_ADDR_EQUAL(m1, m2)); + } +} + +static int +flow_ip_mask2plen(in6_addr_t *v6mask) +{ + int bits; + int plen = IPV6_ABITS; + int i; + + for (i = 3; i >= 0; i--) { + if (v6mask->s6_addr32[i] == 0) { + plen -= 32; + continue; + } + bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; + if (bits == 0) + break; + plen -= bits; + } + return (plen); +} + +/* ARGSUSED */ +static int +flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp, + flow_entry_t *flent) +{ + flow_entry_t **p = headp; + flow_desc_t *fd0, *fd; + in6_addr_t *m0, *m; + int plen0, plen; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + /* + * No special ordering needed for dsfield. + */ + fd0 = &flent->fe_flow_desc; + if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) { + if (*p != NULL) { + ASSERT(flent->fe_next == NULL); + flent->fe_next = *p; + } + *p = flent; + return (0); + } + + /* + * IP address flows are arranged in descending prefix length order. + */ + m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ? + &fd0->fd_local_netmask : &fd0->fd_remote_netmask; + plen0 = flow_ip_mask2plen(m0); + ASSERT(plen0 != 0); + + for (; *p != NULL; p = &(*p)->fe_next) { + fd = &(*p)->fe_flow_desc; + + /* + * Normally a dsfield flent shouldn't end up on the same + * list as an IP address because flow tables are (for now) + * disjoint. If we decide to support both IP and dsfield + * in the same table in the future, this check will allow + * for that. + */ + if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) + continue; + + /* + * We also allow for the mixing of local and remote address + * flents within one list. + */ + m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ? 
+		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
+		plen = flow_ip_mask2plen(m);
+
+		if (plen <= plen0)
+			break;
+	}
+	if (*p != NULL) {
+		ASSERT(flent->fe_next == NULL);
+		flent->fe_next = *p;
+	}
+	*p = flent;
+	return (0);
+}
+
+/*
+ * Transport layer protocol and port matching functions.
+ */
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	return (fd->fd_protocol == l3info->l3_protocol &&
+	    fd->fd_local_port == l4info->l4_hash_port);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	return (fd->fd_protocol == l3info->l3_protocol &&
+	    fd->fd_remote_port == l4info->l4_hash_port);
+}
+
+/*
+ * Transport hash function.
+ * Since we only support either local or remote port flows,
+ * we only need to extract one of the ports to be used for
+ * matching.
+ */
+static uint32_t
+flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	uint8_t		proto = l3info->l3_protocol;
+	boolean_t	dst_or_src;
+
+	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
+		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
+	} else {
+		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
+	}
+
+	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
+	    l4info->l4_src_port;
+
+	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
+}
+
+/*
+ * Unlike other accept() functions above, we do not need to get the header
+ * size because this is our highest layer so far. If we want to support
+ * other higher-layer protocols, we would need to save the l4_hdrsize
+ * in the code below.
+ */
+
+/* ARGSUSED */
+static int
+flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	uint8_t		proto = l3info->l3_protocol;
+	uchar_t		*l4_start;
+
+	l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
+	if (!OK_32PTR(l4_start))
+		return (EINVAL);
+
+	if (l3info->l3_fragmented == B_TRUE)
+		return (EINVAL);
+
+	switch (proto) {
+	case IPPROTO_TCP: {
+		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
+
+		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
+			return (ENOBUFS);
+
+		l4info->l4_src_port = tcph->th_sport;
+		l4info->l4_dst_port = tcph->th_dport;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr	*udph = (struct udphdr *)l4_start;
+
+		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
+			return (ENOBUFS);
+
+		l4info->l4_src_port = udph->uh_sport;
+		l4info->l4_dst_port = udph->uh_dport;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
+
+		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
+			return (ENOBUFS);
+
+		l4info->l4_src_port = sctph->sh_sport;
+		l4info->l4_dst_port = sctph->sh_dport;
+		break;
+	}
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Validates a transport flow entry.
+ * The protocol field must be present.
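+ *
+ * For example, a descriptor with fd_mask ==
+ * (FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL), fd_protocol == IPPROTO_TCP
+ * and a non-zero fd_local_port (kept, like the packet fields it is
+ * compared against, in network byte order) passes validation, while a
+ * descriptor carrying FLOW_IP_PROTOCOL alone belongs to the protocol
+ * table and is rejected here with EOPNOTSUPP.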
+ */
+
+/* ARGSUSED */
+static int
+flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+	flow_mask_t	mask = fd->fd_mask;
+
+	if ((mask & FLOW_IP_PROTOCOL) == 0)
+		return (EINVAL);
+
+	switch (fd->fd_protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	switch (mask & ~FLOW_IP_PROTOCOL) {
+	case FLOW_ULP_PORT_LOCAL:
+		if (fd->fd_local_port == 0)
+			return (EINVAL);
+
+		flent->fe_match = flow_transport_lport_match;
+		break;
+	case FLOW_ULP_PORT_REMOTE:
+		if (fd->fd_remote_port == 0)
+			return (EINVAL);
+
+		flent->fe_match = flow_transport_rport_match;
+		break;
+	case 0:
+		/*
+		 * transport-only flows conflict with our table type.
+		 */
+		return (EOPNOTSUPP);
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+static uint32_t
+flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+	uint16_t	port = 0;
+
+	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
+	    fd->fd_local_port : fd->fd_remote_port;
+
+	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
+{
+	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
+
+	if (fd1->fd_protocol != fd2->fd_protocol)
+		return (B_FALSE);
+
+	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
+		return (fd1->fd_local_port == fd2->fd_local_port);
+
+	return (fd1->fd_remote_port == fd2->fd_remote_port);
+}
+
+static flow_ops_t flow_l2_ops = {
+	flow_l2_accept_fe,
+	flow_l2_hash_fe,
+	flow_l2_match_fe,
+	flow_generic_insert_fe,
+	flow_l2_hash,
+	{flow_l2_accept}
+};
+
+static flow_ops_t flow_ip_ops = {
+	flow_ip_accept_fe,
+	flow_ip_hash_fe,
+	flow_ip_match_fe,
+	flow_ip_insert_fe,
+	flow_ip_hash,
+	{flow_l2_accept, flow_ip_accept}
+};
+
+static flow_ops_t flow_ip_proto_ops = {
+	flow_ip_proto_accept_fe,
+	flow_ip_proto_hash_fe,
+	flow_ip_proto_match_fe,
+	flow_generic_insert_fe,
+	flow_ip_proto_hash,
+	{flow_l2_accept, flow_ip_accept}
+};
+
+static flow_ops_t flow_transport_ops = {
+	flow_transport_accept_fe,
+	flow_transport_hash_fe,
+	flow_transport_match_fe,
+	flow_generic_insert_fe,
+	flow_transport_hash,
+	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
+};
+
+static flow_tab_info_t flow_tab_info_list[] = {
+	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
+	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
+	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
+	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
+	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
+};
+
+#define	FLOW_MAX_TAB_INFO \
+	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
+
+static flow_tab_info_t *
+mac_flow_tab_info_get(flow_mask_t mask)
+{
+	int	i;
+
+	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
+		if (mask == flow_tab_info_list[i].fti_mask)
+			return (&flow_tab_info_list[i]);
+	}
+	return (NULL);
+}
diff --git a/usr/src/uts/common/io/mac/mac_hio.c b/usr/src/uts/common/io/mac/mac_hio.c
new file mode 100644
index 0000000000..d930506ae7
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_hio.c
@@ -0,0 +1,182 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * MAC Hybrid I/O related code.
+ */
+
+#include <sys/types.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+
+
+/*
+ * Return the number of shares supported by the specified MAC.
+ */
+int
+mac_share_capable(mac_handle_t mh)
+{
+	mac_impl_t *mip = (mac_impl_t *)mh;
+
+	return (mip->mi_share_capab.ms_snum);
+}
+
+
+/*
+ * Allocate a share to the specified MAC client. Invoked when
+ * mac_client_open() is invoked with MAC_OPEN_FLAGS_SHARES_DESIRED set.
+ */
+void
+i_mac_share_alloc(mac_client_impl_t *mcip)
+{
+	mac_impl_t *mip = mcip->mci_mip;
+	int rv;
+
+	i_mac_perim_enter(mip);
+
+	ASSERT(mcip->mci_share == NULL);
+
+	if (mac_share_capable((mac_handle_t)mcip->mci_mip) == 0) {
+		DTRACE_PROBE1(i__mac__share__alloc__not__sup,
+		    mac_client_impl_t *, mcip);
+		i_mac_perim_exit(mip);
+		return;
+	}
+
+	rv = mip->mi_share_capab.ms_salloc(mip->mi_share_capab.ms_handle,
+	    &mcip->mci_share);
+	DTRACE_PROBE3(i__mac__share__alloc, mac_client_impl_t *, mcip,
+	    int, rv, mac_share_handle_t, mcip->mci_share);
+
+	mcip->mci_share_bound = B_FALSE;
+
+	i_mac_perim_exit(mip);
+}
+
+
+/*
+ * Free a share previously allocated through i_mac_share_alloc().
+ * Safely handles the case when no shares were allocated to the MAC client.
+ */
+void
+i_mac_share_free(mac_client_impl_t *mcip)
+{
+	mac_impl_t *mip = mcip->mci_mip;
+
+	i_mac_perim_enter(mip);
+
+	/* MAC clients are required to unbind their shares before freeing them */
+	ASSERT(!mcip->mci_share_bound);
+
+	if (mcip->mci_share == NULL) {
+		i_mac_perim_exit(mip);
+		return;
+	}
+
+	mip->mi_share_capab.ms_sfree(mcip->mci_share);
+	i_mac_perim_exit(mip);
+}
+
+
+/*
+ * Bind a share. After this operation the rings that were associated
+ * with the MAC client are mapped directly into the corresponding
+ * guest domain.
+ */
+int
+mac_share_bind(mac_client_handle_t mch, uint64_t cookie, uint64_t *rcookie)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+	mac_impl_t *mip = mcip->mci_mip;
+	int rv;
+
+	i_mac_perim_enter(mip);
+
+	if (mcip->mci_share == NULL) {
+		i_mac_perim_exit(mip);
+		return (ENOTSUP);
+	}
+
+	ASSERT(!mcip->mci_share_bound);
+
+	/*
+	 * Temporarily suspend the TX traffic for that client to make sure
+	 * there are no in-flight packets through a transmit ring
+	 * which is being bound to another domain.
+	 */
+	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
+
+	/*
+	 * For the receive path, no traffic will be sent up through
+	 * the rings to the IO domain. For TX, we need to ensure
+	 * that traffic sent by the MAC client is sent through
+	 * the default ring.
+	 *
+	 * For TX XXX will ensure that packets are sent through the
+	 * default ring if the share of the MAC client is bound.
+	 */
+
+	rv = mip->mi_share_capab.ms_sbind(mcip->mci_share, cookie, rcookie);
+	if (rv == 0)
+		mcip->mci_share_bound = B_TRUE;
+
+	/*
+	 * Resume TX traffic for the MAC client.
Since mci_share_bound is set + * to B_TRUE, mac_tx_send() will not send traffic to individual TX + * rings until the share is unbound. + */ + mac_tx_client_restart(mcip); + + i_mac_perim_exit(mip); + + return (rv); +} + + +/* + * Unbind a share. + */ +void +mac_share_unbind(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + i_mac_perim_enter(mip); + + if (mcip->mci_share == NULL) { + i_mac_perim_exit(mip); + return; + } + + mip->mi_share_capab.ms_sunbind(mcip->mci_share); + + mcip->mci_share_bound = B_FALSE; + + i_mac_perim_exit(mip); +} diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c new file mode 100644 index 0000000000..714fb79afb --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -0,0 +1,1031 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/id_space.h> +#include <sys/esunddi.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/modhash.h> +#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_impl.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_soft_ring.h> +#include <sys/modctl.h> +#include <sys/fs/dv_node.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/callb.h> +#include <sys/cpuvar.h> +#include <sys/atomic.h> +#include <sys/sdt.h> +#include <sys/mac_flow.h> +#include <sys/ddi_intr_impl.h> +#include <sys/disp.h> +#include <sys/sdt.h> + +/* + * MAC Provider Interface. + * + * Interface for GLDv3 compatible NIC drivers. + */ + +static void i_mac_notify_thread(void *); + +typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *); + +typedef struct mac_notify_default_cb_s { + mac_notify_type_t mac_notify_type; + mac_notify_default_cb_fn_t mac_notify_cb_fn; +}mac_notify_default_cb_t; + +mac_notify_default_cb_t mac_notify_cb_list[] = { + { MAC_NOTE_LINK, mac_fanout_recompute}, + { MAC_NOTE_PROMISC, NULL}, + { MAC_NOTE_UNICST, NULL}, + { MAC_NOTE_TX, NULL}, + { MAC_NOTE_RESOURCE, NULL}, + { MAC_NOTE_DEVPROMISC, NULL}, + { MAC_NOTE_FASTPATH_FLUSH, NULL}, + { MAC_NOTE_SDU_SIZE, NULL}, + { MAC_NOTE_MARGIN, NULL}, + { MAC_NOTE_CAPAB_CHG, NULL}, + { MAC_NNOTE, NULL}, +}; + +/* + * Driver support functions. + */ + +/* REGISTRATION */ + +mac_register_t * +mac_alloc(uint_t mac_version) +{ + mac_register_t *mregp; + + /* + * Make sure there isn't a version mismatch between the driver and + * the framework. 
In the future, if multiple versions are + * supported, this check could become more sophisticated. + */ + if (mac_version != MAC_VERSION) + return (NULL); + + mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP); + mregp->m_version = mac_version; + return (mregp); +} + +void +mac_free(mac_register_t *mregp) +{ + kmem_free(mregp, sizeof (mac_register_t)); +} + +/* + * mac_register() is how drivers register new MACs with the GLDv3 + * framework. The mregp argument is allocated by drivers using the + * mac_alloc() function, and can be freed using mac_free() immediately upon + * return from mac_register(). Upon success (0 return value), the mhp + * opaque pointer becomes the driver's handle to its MAC interface, and is + * the argument to all other mac module entry points. + */ +/* ARGSUSED */ +int +mac_register(mac_register_t *mregp, mac_handle_t *mhp) +{ + mac_impl_t *mip; + mactype_t *mtype; + int err = EINVAL; + struct devnames *dnp = NULL; + uint_t instance; + boolean_t style1_created = B_FALSE; + boolean_t style2_created = B_FALSE; + mac_capab_legacy_t legacy; + char *driver; + minor_t minor = 0; + + /* Find the required MAC-Type plugin. */ + if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL) + return (EINVAL); + + /* Create a mac_impl_t to represent this MAC. */ + mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP); + + /* + * The mac is not ready for open yet. + */ + mip->mi_state_flags |= MIS_DISABLED; + + /* + * When a mac is registered, the m_instance field can be set to: + * + * 0: Get the mac's instance number from m_dip. + * This is usually used for physical device dips. + * + * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number. + * For example, when an aggregation is created with the key option, + * "key" will be used as the instance number. + * + * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1]. + * This is often used when a MAC of a virtual link is registered + * (e.g., aggregation when "key" is not specified, or vnic). + * + * Note that the instance number is used to derive the mi_minor field + * of mac_impl_t, which will then be used to derive the name of kstats + * and the devfs nodes. The first 2 cases are needed to preserve + * backward compatibility. + */ + switch (mregp->m_instance) { + case 0: + instance = ddi_get_instance(mregp->m_dip); + break; + case ((uint_t)-1): + minor = mac_minor_hold(B_TRUE); + if (minor == 0) { + err = ENOSPC; + goto fail; + } + instance = minor - 1; + break; + default: + instance = mregp->m_instance; + if (instance >= MAC_MAX_MINOR) { + err = EINVAL; + goto fail; + } + break; + } + + mip->mi_minor = (minor_t)(instance + 1); + mip->mi_dip = mregp->m_dip; + mip->mi_clients_list = NULL; + mip->mi_nclients = 0; + + driver = (char *)ddi_driver_name(mip->mi_dip); + + /* Construct the MAC name as <drvname><instance> */ + (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", + driver, instance); + + mip->mi_driver = mregp->m_driver; + + mip->mi_type = mtype; + mip->mi_margin = mregp->m_margin; + mip->mi_info.mi_media = mtype->mt_type; + mip->mi_info.mi_nativemedia = mtype->mt_nativetype; + if (mregp->m_max_sdu <= mregp->m_min_sdu) + goto fail; + mip->mi_sdu_min = mregp->m_min_sdu; + mip->mi_sdu_max = mregp->m_max_sdu; + mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; + /* + * If the media supports a broadcast address, cache a pointer to it + * in the mac_info_t so that upper layers can use it. 
+	 */
+	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
+
+	mip->mi_v12n_level = mregp->m_v12n;
+
+	/*
+	 * Copy the unicast source address into the mac_info_t, but only if
+	 * the MAC-Type defines a non-zero address length. We need to
+	 * handle MAC-Types that have an address length of 0
+	 * (point-to-point protocol MACs for example).
+	 */
+	if (mip->mi_type->mt_addr_length > 0) {
+		if (mregp->m_src_addr == NULL)
+			goto fail;
+		mip->mi_info.mi_unicst_addr =
+		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
+		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
+		    mip->mi_type->mt_addr_length);
+
+		/*
+		 * Copy the fixed 'factory' MAC address from the immutable
+		 * info. This is taken to be the MAC address currently in
+		 * use.
+		 */
+		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
+		    mip->mi_type->mt_addr_length);
+
+		/*
+		 * At this point, we should set up the classification
+		 * rules etc but we delay it till mac_open() so that
+		 * the resource discovery has taken place and we
+		 * know someone wants to use the device. Otherwise
+		 * memory gets allocated for Rx ring structures even
+		 * during probe.
+		 */
+
+		/* Copy the destination address if one is provided. */
+		if (mregp->m_dst_addr != NULL) {
+			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
+			    mip->mi_type->mt_addr_length);
+		}
+	} else if (mregp->m_src_addr != NULL) {
+		goto fail;
+	}
+
+	/*
+	 * The format of the m_pdata is specific to the plugin. It is
+	 * passed in as an argument to all of the plugin callbacks. The
+	 * driver can update this information by calling
+	 * mac_pdata_update().
+	 */
+	if (mregp->m_pdata != NULL) {
+		/*
+		 * Verify that the plugin supports MAC plugin data and that
+		 * the supplied data is valid.
+		 */
+		if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
+			goto fail;
+		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
+		    mregp->m_pdata_size)) {
+			goto fail;
+		}
+		mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
+		bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size);
+		mip->mi_pdata_size = mregp->m_pdata_size;
+	}
+
+	/*
+	 * Register the private properties.
+	 */
+	mac_register_priv_prop(mip, mregp->m_priv_props,
+	    mregp->m_priv_prop_count);
+
+	/*
+	 * Stash the driver callbacks into the mac_impl_t, but first sanity
+	 * check to make sure all mandatory callbacks are set.
+	 */
+	if (mregp->m_callbacks->mc_getstat == NULL ||
+	    mregp->m_callbacks->mc_start == NULL ||
+	    mregp->m_callbacks->mc_stop == NULL ||
+	    mregp->m_callbacks->mc_setpromisc == NULL ||
+	    mregp->m_callbacks->mc_multicst == NULL) {
+		goto fail;
+	}
+	mip->mi_callbacks = mregp->m_callbacks;
+
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY, &legacy))
+		mip->mi_state_flags |= MIS_LEGACY;
+
+	if (mip->mi_state_flags & MIS_LEGACY) {
+		mip->mi_unsup_note = legacy.ml_unsup_note;
+		mip->mi_phy_dev = legacy.ml_dev;
+	} else {
+		mip->mi_unsup_note = 0;
+		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
+		    ddi_get_instance(mip->mi_dip) + 1);
+	}
+
+	/*
+	 * Allocate a notification thread. thread_create() blocks for memory
+	 * if needed; it never fails.
+	 */
+	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
+	    mip, 0, &p0, TS_RUN, minclsyspri);
+
+	/*
+	 * Initialize the capabilities.
+	 */
+
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
+		mip->mi_state_flags |= MIS_IS_VNIC;
+
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
+		mip->mi_state_flags |= MIS_IS_AGGR;
+
+	mac_addr_factory_init(mip);
+
+	/*
+	 * Enforce the virtualization level registered.
+ */ + if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) { + if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 || + mac_init_rings(mip, MAC_RING_TYPE_TX) != 0) + goto fail; + + /* + * The driver needs to register at least rx rings for this + * virtualization level. + */ + if (mip->mi_rx_groups == NULL) + goto fail; + } + + /* + * The driver must set mc_unicst entry point to NULL when it advertises + * CAP_RINGS for rx groups. + */ + if (mip->mi_rx_groups != NULL) { + if (mregp->m_callbacks->mc_unicst != NULL) + goto fail; + } else { + if (mregp->m_callbacks->mc_unicst == NULL) + goto fail; + } + + /* + * The driver must set mc_tx entry point to NULL when it advertises + * CAP_RINGS for tx rings. + */ + if (mip->mi_tx_groups != NULL) { + if (mregp->m_callbacks->mc_tx != NULL) + goto fail; + } else { + if (mregp->m_callbacks->mc_tx == NULL) + goto fail; + } + + /* + * Initialize MAC addresses. Must be called after mac_init_rings(). + */ + mac_init_macaddr(mip); + + mip->mi_share_capab.ms_snum = 0; + if (mip->mi_v12n_level & MAC_VIRT_HIO) { + (void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES, + &mip->mi_share_capab); + } + + /* + * Initialize the kstats for this device. + */ + mac_stat_create(mip); + + /* Zero out any properties. */ + bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t)); + + /* set the gldv3 flag in dn_flags */ + dnp = &devnamesp[ddi_driver_major(mip->mi_dip)]; + LOCK_DEV_OPS(&dnp->dn_lock); + dnp->dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER); + UNLOCK_DEV_OPS(&dnp->dn_lock); + + if (mip->mi_minor < MAC_MAX_MINOR + 1) { + /* Create a style-2 DLPI device */ + if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, + DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) + goto fail; + style2_created = B_TRUE; + + /* Create a style-1 DLPI device */ + if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, + mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) + goto fail; + style1_created = B_TRUE; + } + + mac_flow_l2tab_create(mip, &mip->mi_flow_tab); + + rw_enter(&i_mac_impl_lock, RW_WRITER); + if (mod_hash_insert(i_mac_impl_hash, + (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { + rw_exit(&i_mac_impl_lock); + err = EEXIST; + goto fail; + } + + DTRACE_PROBE2(mac__register, struct devnames *, dnp, + (mac_impl_t *), mip); + + /* + * Mark the MAC to be ready for open. 
+	 */
+	mip->mi_state_flags &= ~MIS_DISABLED;
+	rw_exit(&i_mac_impl_lock);
+
+	atomic_inc_32(&i_mac_impl_count);
+
+	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
+	*mhp = (mac_handle_t)mip;
+	return (0);
+
+fail:
+	if (style1_created)
+		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
+
+	if (style2_created)
+		ddi_remove_minor_node(mip->mi_dip, driver);
+
+	mac_addr_factory_fini(mip);
+
+	/* Clean up registered MAC addresses */
+	mac_fini_macaddr(mip);
+
+	/* Clean up registered rings */
+	mac_free_rings(mip, MAC_RING_TYPE_RX);
+	mac_free_rings(mip, MAC_RING_TYPE_TX);
+
+	/* Clean up notification thread */
+	if (mip->mi_notify_thread != NULL)
+		i_mac_notify_exit(mip);
+
+	if (mip->mi_info.mi_unicst_addr != NULL) {
+		kmem_free(mip->mi_info.mi_unicst_addr,
+		    mip->mi_type->mt_addr_length);
+		mip->mi_info.mi_unicst_addr = NULL;
+	}
+
+	mac_stat_destroy(mip);
+
+	if (mip->mi_type != NULL) {
+		atomic_dec_32(&mip->mi_type->mt_ref);
+		mip->mi_type = NULL;
+	}
+
+	if (mip->mi_pdata != NULL) {
+		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+		mip->mi_pdata = NULL;
+		mip->mi_pdata_size = 0;
+	}
+
+	if (minor != 0) {
+		ASSERT(minor > MAC_MAX_MINOR);
+		mac_minor_rele(minor);
+	}
+
+	mac_unregister_priv_prop(mip);
+
+	kmem_cache_free(i_mac_impl_cachep, mip);
+	return (err);
+}
+
+/*
+ * Unregister from the GLDv3 framework.
+ */
+int
+mac_unregister(mac_handle_t mh)
+{
+	int			err;
+	mac_impl_t		*mip = (mac_impl_t *)mh;
+	mod_hash_val_t		val;
+	mac_margin_req_t	*mmr, *nextmmr;
+
+	/* Fail the unregister if there are any open references to this mac. */
+	if ((err = mac_disable_nowait(mh)) != 0)
+		return (err);
+
+	/*
+	 * Clean up notification thread and wait for it to exit.
+	 */
+	i_mac_notify_exit(mip);
+
+	i_mac_perim_enter(mip);
+
+	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
+		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
+		ddi_remove_minor_node(mip->mi_dip,
+		    (char *)ddi_driver_name(mip->mi_dip));
+	}
+
+	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
+	    MIS_EXCLUSIVE));
+
+	mac_stat_destroy(mip);
+
+	(void) mod_hash_remove(i_mac_impl_hash,
+	    (mod_hash_key_t)mip->mi_name, &val);
+	ASSERT(mip == (mac_impl_t *)val);
+
+	ASSERT(i_mac_impl_count > 0);
+	atomic_dec_32(&i_mac_impl_count);
+
+	if (mip->mi_pdata != NULL)
+		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+	mip->mi_pdata = NULL;
+	mip->mi_pdata_size = 0;
+
+	/*
+	 * Free the list of margin requests.
+	 */
+	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
+		nextmmr = mmr->mmr_nextp;
+		kmem_free(mmr, sizeof (mac_margin_req_t));
+	}
+	mip->mi_mmrp = NULL;
+
+	mip->mi_linkstate = LINK_STATE_UNKNOWN;
+	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
+	mip->mi_info.mi_unicst_addr = NULL;
+
+	atomic_dec_32(&mip->mi_type->mt_ref);
+	mip->mi_type = NULL;
+
+	/*
+	 * Free the primary MAC address.
+	 */
+	mac_fini_macaddr(mip);
+
+	/*
+	 * free all rings
+	 */
+	mac_free_rings(mip, MAC_RING_TYPE_RX);
+	mac_free_rings(mip, MAC_RING_TYPE_TX);
+
+	mac_addr_factory_fini(mip);
+
+	bzero(mip->mi_addr, MAXMACADDRLEN);
+	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
+
+	/* and the flows */
+	mac_flow_tab_destroy(mip->mi_flow_tab);
+	mip->mi_flow_tab = NULL;
+
+	if (mip->mi_minor > MAC_MAX_MINOR)
+		mac_minor_rele(mip->mi_minor);
+
+	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
+
+	/*
+	 * Reset the perim-related fields to default values before
+	 * kmem_cache_free
+	 */
+	i_mac_perim_exit(mip);
+	mip->mi_state_flags = 0;
+
+	mac_unregister_priv_prop(mip);
+	kmem_cache_free(i_mac_impl_cachep, mip);
+
+	return (0);
+}
+
+/* DATA RECEPTION */
+
+/*
+ * This function is invoked for packets received by the MAC driver in
+ * interrupt context. The ring generation number provided by the driver
+ * is matched with the ring generation number held in MAC. If they do not
+ * match, received packets are considered stale packets coming from an older
+ * assignment of the ring. Drop them.
+ */
+void
+mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
+    uint64_t mr_gen_num)
+{
+	mac_ring_t		*mr = (mac_ring_t *)mrh;
+
+	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
+		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
+		    mr->mr_gen_num, uint64_t, mr_gen_num);
+		freemsgchain(mp_chain);
+		return;
+	}
+	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
+}
+
+/*
+ * This function is invoked for each packet received by the underlying
+ * driver.
+ */
+void
+mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
+{
+	mac_impl_t *mip = (mac_impl_t *)mh;
+	mac_ring_t		*mr = (mac_ring_t *)mrh;
+	mac_soft_ring_set_t	*mac_srs;
+	mblk_t			*bp = mp_chain;
+	boolean_t		hw_classified = B_FALSE;
+
+	/*
+	 * If there are any promiscuous mode callbacks defined for
+	 * this MAC, pass them a copy if appropriate.
+	 */
+	if (mip->mi_promisc_list != NULL)
+		mac_promisc_dispatch(mip, mp_chain, NULL);
+
+	if (mr != NULL) {
+		/*
+		 * If the SRS teardown has started, just return. The 'mr'
+		 * continues to be valid until the driver unregisters the mac.
+		 * Hardware classified packets will not make their way up
+		 * beyond this point once the teardown has started. The driver
+		 * is never passed a pointer to a flow entry or SRS or any
+		 * structure that can be freed much before mac_unregister.
+		 */
+		mutex_enter(&mr->mr_lock);
+		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
+		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
+			mutex_exit(&mr->mr_lock);
+			freemsgchain(mp_chain);
+			return;
+		}
+		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
+			hw_classified = B_TRUE;
+			MR_REFHOLD_LOCKED(mr);
+		}
+		mutex_exit(&mr->mr_lock);
+
+		/*
+		 * We check if an SRS is controlling this ring.
+		 * If so, we can directly call the srs_lower_proc
+		 * routine; otherwise we need to go through mac_rx_classify
+		 * to reach the right place.
+		 */
+		if (hw_classified) {
+			mac_srs = mr->mr_srs;
+			/*
+			 * This is supposed to be the fast path.
+			 * All packets received through here were steered by
+			 * the hardware classifier, and share the same
+			 * MAC header info.
+			 */
+			mac_srs->srs_rx.sr_lower_proc(mh,
+			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
+			MR_REFRELE(mr);
+			return;
+		}
+		/* We'll fall through to software classification */
+	}
+
+	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
+		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
+			return;
+	}
+
+	freemsgchain(bp);
+}
+
+/* DATA TRANSMISSION */
+
+/*
+ * A driver's notification to resume transmission, in case of a provider
+ * without TX rings.
+ */
+void
+mac_tx_update(mac_handle_t mh)
+{
+	/*
+	 * Walk the list of MAC clients (mac_client_handle)
+	 * and update their TX SRSs.
+	 */
+	i_mac_tx_srs_notify((mac_impl_t *)mh, NULL);
+}
+
+/*
+ * A driver's notification to resume transmission on the specified TX ring.
+ */
+void
+mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
+{
+	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
+}
+
+/* LINK STATE */
+/*
+ * Notify the MAC layer about a link state change.
+ */
+void
+mac_link_update(mac_handle_t mh, link_state_t link)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	/*
+	 * Save the link state.
+	 */
+	mip->mi_linkstate = link;
+
+	/*
+	 * Send a MAC_NOTE_LINK notification.
+	 */
+	i_mac_notify(mip, MAC_NOTE_LINK);
+}
+
+/* OTHER CONTROL INFORMATION */
+
+/*
+ * A driver notified us that its primary MAC address has changed.
+ */
+void
+mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	if (mip->mi_type->mt_addr_length == 0)
+		return;
+
+	i_mac_perim_enter(mip);
+	/*
+	 * If the address doesn't change, do nothing.
+	 */
+	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) {
+		i_mac_perim_exit(mip);
+		return;
+	}
+
+	/*
+	 * Freshen the MAC address value and update all MAC clients that
+	 * share this MAC address.
+	 */
+	mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
+	    (uint8_t *)addr);
+
+	i_mac_perim_exit(mip);
+
+	/*
+	 * Send a MAC_NOTE_UNICST notification.
+	 */
+	i_mac_notify(mip, MAC_NOTE_UNICST);
+}
+
+/*
+ * The provider's H/W resources (e.g. ring grouping) have changed.
+ * Notify the MAC framework to trigger a re-negotiation of the capabilities.
+ */
+void
+mac_resource_update(mac_handle_t mh)
+{
+	/*
+	 * Send a MAC_NOTE_RESOURCE notification.
+	 */
+	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_RESOURCE);
+}
+
+/*
+ * MAC plugin information changed.
+ */
+int
+mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	/*
+	 * Verify that the plugin supports MAC plugin data and that the
+	 * supplied data is valid.
+	 */
+	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
+		return (EINVAL);
+	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
+		return (EINVAL);
+
+	if (mip->mi_pdata != NULL)
+		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+
+	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
+	bcopy(mac_pdata, mip->mi_pdata, dsize);
+	mip->mi_pdata_size = dsize;
+
+	/*
+	 * Since the MAC plugin data is used to construct MAC headers that
+	 * were cached in fast-path headers, we need to flush fast-path
+	 * information for links associated with this mac.
+	 */
+	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
+	return (0);
+}
+
+/*
+ * Invoked by the driver as well as the framework to signal a capability
+ * change.
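+ *
+ * A driver would typically call this after an internal reconfiguration
+ * changes what it can advertise through its getcapab entry point.
+ * A minimal, hypothetical sketch (the xx_ names are made up):
+ *
+ *	static void
+ *	xx_toggle_lso(xx_softc_t *sc, boolean_t on)
+ *	{
+ *		sc->sc_lso_enabled = on;
+ *		mac_capab_update(sc->sc_mh);
+ *	}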
+ */ +void +mac_capab_update(mac_handle_t mh) +{ + /* Send MAC_NOTE_CAPAB_CHG notification */ + i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG); +} + +int +mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (sdu_max <= mip->mi_sdu_min) + return (EINVAL); + mip->mi_sdu_max = sdu_max; + + /* Send a MAC_NOTE_SDU_SIZE notification. */ + i_mac_notify(mip, MAC_NOTE_SDU_SIZE); + return (0); +} + +/* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */ + +/* + * Updates the mac_impl structure with the current state of the link + */ +static void +i_mac_log_link_state(mac_impl_t *mip) +{ + /* + * If no change, then it is not interesting. + */ + if (mip->mi_lastlinkstate == mip->mi_linkstate) + return; + + switch (mip->mi_linkstate) { + case LINK_STATE_UP: + if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) { + char det[200]; + + mip->mi_type->mt_ops.mtops_link_details(det, + sizeof (det), (mac_handle_t)mip, mip->mi_pdata); + + cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det); + } else { + cmn_err(CE_NOTE, "!%s link up", mip->mi_name); + } + break; + + case LINK_STATE_DOWN: + /* + * Only transitions from UP to DOWN are interesting + */ + if (mip->mi_lastlinkstate != LINK_STATE_UNKNOWN) + cmn_err(CE_NOTE, "!%s link down", mip->mi_name); + break; + + case LINK_STATE_UNKNOWN: + /* + * This case is normally not interesting. + */ + break; + } + mip->mi_lastlinkstate = mip->mi_linkstate; +} + +/* + * Main routine for the callbacks notifications thread + */ +static void +i_mac_notify_thread(void *arg) +{ + mac_impl_t *mip = arg; + callb_cpr_t cprinfo; + mac_cb_t *mcb; + mac_cb_info_t *mcbi; + mac_notify_cb_t *mncb; + + mcbi = &mip->mi_notify_cb_info; + CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr, + "i_mac_notify_thread"); + + mutex_enter(mcbi->mcbi_lockp); + + for (;;) { + uint32_t bits; + uint32_t type; + + bits = mip->mi_notify_bits; + if (bits == 0) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); + CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp); + continue; + } + mip->mi_notify_bits = 0; + if ((bits & (1 << MAC_NNOTE)) != 0) { + /* request to quit */ + ASSERT(mip->mi_state_flags & MIS_DISABLED); + break; + } + + mutex_exit(mcbi->mcbi_lockp); + + /* + * Log link changes. + */ + if ((bits & (1 << MAC_NOTE_LINK)) != 0) + i_mac_log_link_state(mip); + + /* + * Do notification callbacks for each notification type. + */ + for (type = 0; type < MAC_NNOTE; type++) { + if ((bits & (1 << type)) == 0) { + continue; + } + + if (mac_notify_cb_list[type].mac_notify_cb_fn) + mac_notify_cb_list[type].mac_notify_cb_fn(mip); + + /* + * Walk the list of notifications. + */ + MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info); + for (mcb = mip->mi_notify_cb_list; mcb != NULL; + mcb = mcb->mcb_nextp) { + mncb = (mac_notify_cb_t *)mcb->mcb_objp; + mncb->mncb_fn(mncb->mncb_arg, type); + } + MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info, + &mip->mi_notify_cb_list); + } + + mutex_enter(mcbi->mcbi_lockp); + } + + mip->mi_state_flags |= MIS_NOTIFY_DONE; + cv_broadcast(&mcbi->mcbi_cv); + + /* CALLB_CPR_EXIT drops the lock */ + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +/* + * Signal the i_mac_notify_thread asking it to quit. + * Then wait till it is done. 
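+ * The handshake: set the MAC_NNOTE bit in mi_notify_bits (which the
+ * thread above interprets as a request to exit), broadcast on
+ * mcbi_cv, then block until the thread acknowledges by setting
+ * MIS_NOTIFY_DONE.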
+ */ +void +i_mac_notify_exit(mac_impl_t *mip) +{ + mac_cb_info_t *mcbi; + + mcbi = &mip->mi_notify_cb_info; + + mutex_enter(mcbi->mcbi_lockp); + mip->mi_notify_bits = (1 << MAC_NNOTE); + cv_broadcast(&mcbi->mcbi_cv); + + + while ((mip->mi_notify_thread != NULL) && + !(mip->mi_state_flags & MIS_NOTIFY_DONE)) { + cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); + } + + /* Necessary clean up before doing kmem_cache_free */ + mip->mi_state_flags &= ~MIS_NOTIFY_DONE; + mip->mi_notify_bits = 0; + mip->mi_notify_thread = NULL; + mutex_exit(mcbi->mcbi_lockp); +} + +/* + * Entry point invoked by drivers to dynamically add a ring to an + * existing group. + */ +int +mac_group_add_ring(mac_group_handle_t gh, int index) +{ + mac_group_t *group = (mac_group_t *)gh; + mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; + int ret; + + i_mac_perim_enter(mip); + + /* + * Only RX rings can be added or removed by drivers currently. + */ + ASSERT(group->mrg_type == MAC_RING_TYPE_RX); + + ret = i_mac_group_add_ring(group, NULL, index); + + i_mac_perim_exit(mip); + + return (ret); +} + +/* + * Entry point invoked by drivers to dynamically remove a ring + * from an existing group. The specified ring handle must no longer + * be used by the driver after a call to this function. + */ +void +mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh) +{ + mac_group_t *group = (mac_group_t *)gh; + mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; + + i_mac_perim_enter(mip); + + /* + * Only RX rings can be added or removed by drivers currently. + */ + ASSERT(group->mrg_type == MAC_RING_TYPE_RX); + + i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE); + + i_mac_perim_exit(mip); +} diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c new file mode 100644 index 0000000000..290366f5d2 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -0,0 +1,3819 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+#include <inet/ip6.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+
+typedef struct mac_tx_mode_s {
+	mac_tx_srs_mode_t	mac_tx_mode;
+	mac_tx_func_t		mac_tx_func;
+} mac_tx_mode_t;
+
+/*
+ * There are five modes of operation on the Tx side. These modes get set
+ * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
+ * none of the other modes are user configurable. They get selected by
+ * the system depending upon whether the link (or flow) has multiple Tx
+ * rings or a bandwidth configured, etc.
+ */
+mac_tx_mode_t mac_tx_mode_list[] = {
+	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
+	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
+	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
+	{SRS_TX_BW,		mac_tx_bw_mode},
+	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
+};
+
+/*
+ * Soft Ring Set (SRS) - the run-time code that deals with
+ * dynamic polling from the hardware, bandwidth enforcement,
+ * fanout etc.
+ *
+ * We try to use H/W classification on the NIC and assign traffic for
+ * a MAC address to a particular Rx ring or ring group. There is a
+ * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
+ * switches the underlying Rx ring between interrupt and
+ * polling mode and enforces any specified B/W control.
+ *
+ * There is always an SRS created and tied to each H/W and S/W rule.
+ * Whenever we create a H/W rule, we always add the same rule to the
+ * S/W classifier and tie an SRS to it.
+ *
+ * In case a B/W control is specified, it is broken into bytes
+ * per tick and as soon as the quota for a tick is exhausted,
+ * the underlying Rx ring is forced into poll mode for remainder of
+ * the tick. The SRS poll thread only polls for bytes that are
+ * allowed to come in the SRS. We typically let 4x the configured
+ * B/W worth of packets to come in the SRS (to prevent unnecessary
+ * drops due to bursts) but only process the specified amount.
+ *
+ * A MAC client (e.g. a VNIC or aggr) can have 1 or more
+ * Rx rings (and corresponding SRSs) assigned to it. The SRS
+ * in turn can have softrings to do protocol level fanout or
+ * softrings to do S/W based fanout or both. In case the NIC
+ * has no Rx rings, we do S/W classification to the respective SRS.
+ * The S/W classification rule is always setup and ready. This
+ * allows the MAC layer to reassign Rx rings whenever needed
+ * but packets still continue to flow via the default path and
+ * get S/W classified to the correct SRS.
+ *
+ * The SRS's are used on both Tx and Rx side. They use the same
+ * data structure but the processing routines have slightly different
+ * semantics due to the fact that Rx side needs to do dynamic
+ * polling etc.
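+ *
+ * As a worked example of the B/W control described above (numbers
+ * illustrative, not the exact internal computation): a 100 Mbps limit
+ * with hz = 100 yields a quota of roughly
+ * (100000000 / 8) / 100 = 125000 bytes per tick; once that many bytes
+ * have been processed within the current tick, the ring stays in poll
+ * mode and further pickup waits for the next tick.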
+ *
+ * Dynamic Polling Notes
+ * =====================
+ *
+ * Each Soft ring set is capable of switching its Rx ring between
+ * interrupt and poll mode and actively 'polls' for packets in
+ * poll mode. If the SRS is implementing a B/W limit, it makes
+ * sure that only the maximum allowed packets are pulled in poll mode
+ * and goes to poll mode as soon as the B/W limit is exceeded. As
+ * such, there is no added overhead to implementing B/W limits.
+ *
+ * In poll mode, it's better to keep the pipeline going where the
+ * SRS worker thread keeps processing packets and the poll thread
+ * keeps bringing more packets (especially if they get to run
+ * on different CPUs). This also prevents the overhead associated
+ * with excessive signalling (on NUMA machines, this can be
+ * pretty devastating). The exception is the latency-optimized case
+ * where the worker thread does no work and the interrupt and poll
+ * threads are allowed to do their own drain.
+ *
+ * We use the following policy to control Dynamic Polling:
+ * 1) We switch to poll mode anytime the processing
+ *    thread causes a backlog to build up in SRS and
+ *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
+ * 2) As long as the backlog stays under the low water
+ *    mark (sr_lowat), we poll the H/W for more packets.
+ * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
+ *    water mark, we stay in poll mode but don't poll
+ *    the H/W for more packets.
+ * 4) Anytime in polling mode, if we poll the H/W for
+ *    packets and find nothing plus we have an existing
+ *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
+ *    mode but don't poll the H/W for packets anymore
+ *    (let the polling thread go to sleep).
+ * 5) Once the backlog is relieved (packets are processed)
+ *    we reenable polling (by signalling the poll thread)
+ *    only when the backlog dips below sr_poll_thres.
+ * 6) sr_hiwat is used exclusively when we are not
+ *    polling capable and is used to decide when to
+ *    drop packets so the SRS queue length doesn't grow
+ *    infinitely.
+ *
+ * NOTE: Also see the block level comment on top of mac_soft_ring.c
+ */
+
+/*
+ * mac_latency_optimize
+ *
+ * Controls whether the poll thread can process the packets inline
+ * or let the SRS worker thread do the processing. This applies if
+ * the SRS was not being processed. For latency sensitive traffic,
+ * this needs to be true to allow inline processing. For throughput
+ * under load, this should be false.
+ *
+ * This (and other similar) tunable should be rolled into a link
+ * or flow specific workload hint that can be set using dladm
+ * linkprop (instead of multiple such tunables).
+ */
+boolean_t mac_latency_optimize = B_TRUE;
+
+/*
+ * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
+ *
+ * Queue an mp or chain in the soft ring set and increment the
+ * local count (srs_count) for the SRS and the shared counter
+ * (srs_poll_pkt_cnt - shared between SRS and its soft rings
+ * to track the total unprocessed packets for polling to work
+ * correctly).
+ *
+ * The size (total bytes queued) counters are incremented only
+ * if we are doing B/W control.
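+ *
+ * As a concrete illustration of the counters involved: enqueueing a
+ * 3-packet, 4500-byte chain bumps srs_count (and, on Rx,
+ * sr_poll_pkt_cnt) by 3, while srs_size and mac_bw_sz grow by 4500
+ * only when SRST_BW_CONTROL is set on the SRS.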
+ */ +#define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if ((mac_srs)->srs_last != NULL) \ + (mac_srs)->srs_last->b_next = (head); \ + else \ + (mac_srs)->srs_first = (head); \ + (mac_srs)->srs_last = (tail); \ + (mac_srs)->srs_count += count; \ +} + +#define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ + mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ + \ + MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ + srs_rx->sr_poll_pkt_cnt += count; \ + ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ + if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ + (mac_srs)->srs_size += (sz); \ + mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ + (mac_srs)->srs_bw->mac_bw_sz += (sz); \ + mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ + } \ +} + +#define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ + mac_srs->srs_state |= SRS_ENQUEUED; \ + MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ + if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ + (mac_srs)->srs_size += (sz); \ + (mac_srs)->srs_bw->mac_bw_sz += (sz); \ + } \ +} + +/* + * Turn polling on routines + */ +#define MAC_SRS_POLLING_ON(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if (((mac_srs)->srs_state & \ + (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ + (mac_srs)->srs_state |= SRS_POLLING; \ + (void) mac_hwring_disable_intr((mac_ring_handle_t) \ + (mac_srs)->srs_ring); \ + (mac_srs)->srs_rx.sr_poll_on++; \ + } \ +} + +#define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if (((mac_srs)->srs_state & \ + (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ + (SRS_POLLING_CAPAB|SRS_WORKER)) { \ + (mac_srs)->srs_state |= SRS_POLLING; \ + (void) mac_hwring_disable_intr((mac_ring_handle_t) \ + (mac_srs)->srs_ring); \ + (mac_srs)->srs_rx.sr_worker_poll_on++; \ + } \ +} + +/* + * MAC_SRS_POLL_RING + * + * Signal the SRS poll thread to poll the underlying H/W ring + * provided it wasn't already polling (SRS_GET_PKTS was set). + * + * Poll thread gets to run only from mac_rx_srs_drain() and only + * if the drain was being done by the worker thread. + */ +#define MAC_SRS_POLL_RING(mac_srs) { \ + mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ + \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + srs_rx->sr_poll_thr_sig++; \ + if (((mac_srs)->srs_state & \ + (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ + (SRS_WORKER|SRS_POLLING_CAPAB)) { \ + (mac_srs)->srs_state |= SRS_GET_PKTS; \ + cv_signal(&(mac_srs)->srs_cv); \ + } else { \ + srs_rx->sr_poll_thr_busy++; \ + } \ +} + +/* + * MAC_SRS_CHECK_BW_CONTROL + * + * Check to see if next tick has started so we can reset the + * SRS_BW_ENFORCED flag and allow more packets to come in the + * system. + */ +#define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + ASSERT(((mac_srs)->srs_type & SRST_TX) || \ + MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ + if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \ + (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \ + (mac_srs)->srs_bw->mac_bw_used = 0; \ + if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ + (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ + } \ +} + +/* + * MAC_SRS_WORKER_WAKEUP + * + * Wake up the SRS worker thread to process the queue as long as + * no one else is processing the queue. If we are optimizing for + * latency, we wake up the worker thread immediately or else we + * wait mac_srs_worker_wakeup_ticks before worker thread gets + * woken up. 
+ */ +int mac_srs_worker_wakeup_ticks = 0; +#define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if (!((mac_srs)->srs_state & SRS_PROC) && \ + (mac_srs)->srs_tid == NULL) { \ + if (mac_latency_optimize || \ + (mac_srs_worker_wakeup_ticks == 0)) \ + cv_signal(&(mac_srs)->srs_async); \ + else \ + (mac_srs)->srs_tid = \ + timeout(mac_srs_fire, (mac_srs), \ + mac_srs_worker_wakeup_ticks); \ + } \ +} + +#define TX_SINGLE_RING_MODE(mac_srs) \ + ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ + (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ + (mac_srs)->srs_tx.st_mode == SRS_TX_BW) + +#define TX_BANDWIDTH_MODE(mac_srs) \ + ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ + (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) + +#define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ + uint_t hash, indx; \ + hash = HASH_HINT(hint); \ + indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ + softring = mac_srs->srs_oth_soft_rings[indx]; \ + (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ +} + +/* + * MAC_TX_SRS_BLOCK + * + * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED + * will be set only if srs_tx_woken_up is FALSE. If + * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived + * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to + * attempt to transmit again and not setting SRS_TX_BLOCKED does + * that. + */ +#define MAC_TX_SRS_BLOCK(srs, mp) { \ + ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ + if ((srs)->srs_tx.st_woken_up) { \ + (srs)->srs_tx.st_woken_up = B_FALSE; \ + } else { \ + ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ + (srs)->srs_state |= SRS_TX_BLOCKED; \ + (srs)->srs_tx.st_blocked_cnt++; \ + } \ +} + +/* + * MAC_TX_SRS_TEST_HIWAT + * + * Called before queueing a packet onto Tx SRS to test and set + * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. + */ +#define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ + boolean_t enqueue = 1; \ + \ + if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ + /* \ + * flow-controlled. Store srs in cookie so that it \ + * can be returned as mac_tx_cookie_t to client \ + */ \ + (srs)->srs_state |= SRS_TX_HIWAT; \ + cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_hiwat_cnt++; \ + if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ + /* increment freed stats */ \ + (srs)->srs_tx.st_drop_count += cnt; \ + /* \ + * b_prev may be set to the fanout hint \ + * hence can't use freemsg directly \ + */ \ + mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + DTRACE_PROBE1(tx_queued_hiwat, \ + mac_soft_ring_set_t *, srs); \ + enqueue = 0; \ + } \ + } \ + if (enqueue) \ + MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ +} + +/* Some utility macros */ +#define MAC_SRS_BW_LOCK(srs) \ + if (!(srs->srs_type & SRST_TX)) \ + mutex_enter(&srs->srs_bw->mac_bw_lock); + +#define MAC_SRS_BW_UNLOCK(srs) \ + if (!(srs->srs_type & SRST_TX)) \ + mutex_exit(&srs->srs_bw->mac_bw_lock); + +#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ + mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ + /* increment freed stats */ \ + mac_srs->srs_tx.st_drop_count++; \ + cookie = (mac_tx_cookie_t)srs; \ +} + +#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ + mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ + cookie = (mac_tx_cookie_t)srs; \ + *ret_mp = mp_chain; \ +} + +/* + * Drop the rx packet and advance to the next one in the chain. 
+ */
+static void
+mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
+{
+    mac_srs_rx_t *srs_rx = &srs->srs_rx;
+
+    ASSERT(mp->b_next == NULL);
+    mutex_enter(&srs->srs_lock);
+    MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
+    MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
+    mutex_exit(&srs->srs_lock);
+
+    srs_rx->sr_drop_count++;
+    freemsg(mp);
+}
+
+/* DATAPATH RUNTIME ROUTINES */
+
+/*
+ * mac_srs_fire
+ *
+ * Timer callback routine for waking up the SRS worker thread.
+ */
+static void
+mac_srs_fire(void *arg)
+{
+    mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
+
+    mutex_enter(&mac_srs->srs_lock);
+    if (mac_srs->srs_tid == 0) {
+        mutex_exit(&mac_srs->srs_lock);
+        return;
+    }
+
+    mac_srs->srs_tid = 0;
+    if (!(mac_srs->srs_state & SRS_PROC))
+        cv_signal(&mac_srs->srs_async);
+
+    mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * 'hint' is the fanout_hint (of type uint64_t) that is given by the
+ * TCP/IP stack, and it is used on the Tx path.
+ */
+#define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16))
+
+/*
+ * Hash based on the src address and the port information.
+ */
+#define HASH_ADDR(src, ports) \
+    (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
+    ((ports) >> 8) ^ (ports))
+
+#define COMPUTE_INDEX(key, sz) (key % sz)
+
+#define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
+    if ((tail) != NULL) { \
+        ASSERT((tail)->b_next == NULL); \
+        (tail)->b_next = (mp); \
+    } else { \
+        ASSERT((head) == NULL); \
+        (head) = (mp); \
+    } \
+    (tail) = (mp); \
+    (cnt)++; \
+    if ((bw_ctl)) \
+        (sz) += (sz0); \
+}
+
+#define MAC_FANOUT_DEFAULT 0
+#define MAC_FANOUT_RND_ROBIN 1
+int mac_fanout_type = MAC_FANOUT_DEFAULT;
+
+#define MAX_SR_TYPES 3
+/* fanout types for port based hashing */
+enum pkt_type {
+    V4_TCP = 0,
+    V4_UDP,
+    OTH,
+    UNDEF
+};
+
+/*
+ * In general we do port based hashing to spread traffic over different
+ * softrings. The tunable below allows that behavior to be overridden:
+ * setting it to B_TRUE makes the fanout use the source IPv6 address
+ * instead. Source based fanout is also applied to IPv6 packets carrying
+ * multiple optional headers and to other uncommon packet types.
+ */
+boolean_t mac_src_ipv6_fanout = B_FALSE;
+
+/*
+ * Pair of local and remote ports in the transport header
+ */
+#define PORTS_SIZE 4
+
+/*
+ * mac_rx_srs_proto_fanout
+ *
+ * This routine delivers packets destined to an SRS into one of the
+ * protocol soft rings.
+ *
+ * Given a chain of packets we need to split it up into multiple sub
+ * chains, destined for the TCP, UDP or OTH soft ring. Instead of
+ * entering the soft ring one packet at a time, we want to enter it in
+ * the form of a chain; otherwise we get a start/stop behaviour where
+ * the worker thread goes to sleep and then the next packet comes in,
+ * forcing it to wake up, etc.
+ */
+static void
+mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
+{
+    struct ether_header *ehp;
+    uint16_t etype;
+    ipha_t *ipha;
+    mac_soft_ring_t *softring;
+    size_t ether_hlen;
+    mblk_t *mp;
+    mblk_t *headmp[MAX_SR_TYPES];
+    mblk_t *tailmp[MAX_SR_TYPES];
+    int cnt[MAX_SR_TYPES];
+    size_t sz[MAX_SR_TYPES];
+    size_t sz1;
+    boolean_t bw_ctl = B_FALSE;
+    boolean_t hw_classified;
+    boolean_t dls_bypass = B_TRUE;
+    enum pkt_type type;
+    mac_client_impl_t *mcip = mac_srs->srs_mcip;
+    struct ether_vlan_header *evhp;
+
+    if (mac_srs->srs_type & SRST_BW_CONTROL)
+        bw_ctl = B_TRUE;
+
+    /*
+     * If we don't have an Rx ring, S/W classification would have done
+     * its job and it's a packet meant for us. If we were polling on
+     * the default ring (i.e.
there was a ring assigned to this SRS), + * then we need to make sure that the mac address really belongs + * to us. + */ + hw_classified = mac_srs->srs_ring != NULL && + mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; + + /* + * Special clients (eg. VLAN, non ether, etc) need DLS + * processing in the Rx path. SRST_DLS_BYPASS will be clear for + * such SRSs. + */ + if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) + dls_bypass = B_FALSE; + + bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); + bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); + bzero(cnt, MAX_SR_TYPES * sizeof (int)); + bzero(sz, MAX_SR_TYPES * sizeof (size_t)); + + /* + * We got a chain from SRS that we need to send to the soft rings. + * Since squeues for TCP & IPv4 sap poll their soft rings (for + * performance reasons), we need to separate out v4_tcp, v4_udp + * and the rest goes in other. + */ + while (head != NULL) { + mp = head; + head = head->b_next; + mp->b_next = NULL; + + type = OTH; + sz1 = msgdsize(mp); + + if (!dls_bypass) { + mac_impl_t *mip = mcip->mci_mip; + + ehp = (struct ether_header *)mp->b_rptr; + + /* + * For VLAN packets, if the VLAN id doesn't belong + * to this client, we drop the packet. + */ + if (mip->mi_info.mi_nativemedia == DL_ETHER && + ntohs(ehp->ether_type) == VLAN_TPID) { + /* + * LINTED: cast may result in improper + * alignment + */ + evhp = (struct ether_vlan_header *)ehp; + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], + cnt[type], bw_ctl, sz[type], sz1, mp); + continue; + } + + /* + * At this point we can be sure the packet at least + * has an ether header. + */ + if (sz1 < sizeof (struct ether_header)) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + /* LINTED: cast may result in improper alignment */ + ehp = (struct ether_header *)mp->b_rptr; + + /* + * Determine if this is a VLAN or non-VLAN packet. + */ + if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { + /* LINTED: cast may result in improper alignment */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + ether_hlen = sizeof (struct ether_vlan_header); + /* + * Check if the VID of the packet, if any, belongs + * to this client. + */ + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } else { + ether_hlen = sizeof (struct ether_header); + } + + if (etype == ETHERTYPE_IP) { + /* + * If we are H/W classified, but we have promisc + * on, then we need to check for the unicast address. + */ + if (hw_classified && mcip->mci_promisc_list != NULL) { + mac_address_t *map; + + rw_enter(&mcip->mci_rw_lock, RW_READER); + map = mcip->mci_unicast; + if (bcmp(&ehp->ether_dhost, map->ma_addr, + map->ma_len) == 0) + type = UNDEF; + rw_exit(&mcip->mci_rw_lock); + } else if (((((uint8_t *)&ehp->ether_dhost)[0] & + 0x01) == 0)) { + type = UNDEF; + } + } + + /* + * This needs to become a contract with the driver for + * the fast path. + * + * In the normal case the packet will have at least the L2 + * header and the IP + Transport header in the same mblk. + * This is usually the case when the NIC driver sends up + * the packet. This is also true when the stack generates + * a packet that is looped back and when the stack uses the + * fastpath mechanism. The normal case is optimized for + * performance and may bypass DLS. All other cases go through + * the 'OTH' type path without DLS bypass. 
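+ *
+ * A sketch of the contiguity test that contract implies (the helper
+ * name is hypothetical and assumes an options-free IPv4 header; the
+ * real check used below is MBLK_RX_FANOUT_SLOWPATH):
+ *
+ *	static boolean_t
+ *	rx_fastpath_ok(mblk_t *mp, size_t l2len)
+ *	{
+ *		// The L2 header, the IP header and the first 4
+ *		// bytes of the transport header (the ports) must
+ *		// all sit in the first mblk.
+ *		return (MBLKL(mp) >= l2len + sizeof (ipha_t) +
+ *		    PORTS_SIZE);
+ *	}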
+ */ + + /* LINTED: cast may result in improper alignment */ + ipha = (ipha_t *)(mp->b_rptr + ether_hlen); + if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) + type = OTH; + + if (type == OTH) { + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], + cnt[type], bw_ctl, sz[type], sz1, mp); + continue; + } + + ASSERT(type == UNDEF); + /* + * We look for at least 4 bytes past the IP header to get + * the port information. If we get an IP fragment, we don't + * have the port information, and we use just the protocol + * information. + */ + switch (ipha->ipha_protocol) { + case IPPROTO_TCP: + type = V4_TCP; + mp->b_rptr += ether_hlen; + break; + case IPPROTO_UDP: + type = V4_UDP; + mp->b_rptr += ether_hlen; + break; + default: + type = OTH; + break; + } + + ASSERT(type != UNDEF); + + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], + bw_ctl, sz[type], sz1, mp); + } + + for (type = V4_TCP; type < UNDEF; type++) { + if (headmp[type] != NULL) { + ASSERT(tailmp[type]->b_next == NULL); + switch (type) { + case V4_TCP: + softring = mac_srs->srs_tcp_soft_rings[0]; + break; + case V4_UDP: + softring = mac_srs->srs_udp_soft_rings[0]; + break; + case OTH: + softring = mac_srs->srs_oth_soft_rings[0]; + } + mac_rx_soft_ring_process(mac_srs->srs_mcip, softring, + headmp[type], tailmp[type], cnt[type], sz[type]); + } + } +} + +int fanout_unalligned = 0; + +/* + * mac_rx_srs_long_fanout + * + * The fanout routine for IPv6 + */ +static int +mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, + uint16_t etype, enum pkt_type *type, uint_t *indx) +{ + ip6_t *ip6h; + uint8_t *whereptr; + uint_t hash; + uint16_t remlen; + uint8_t nexthdr; + uint16_t hdr_len; + + if (etype == ETHERTYPE_IPV6) { + boolean_t modifiable = B_TRUE; + + ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + + ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header)); + if ((unsigned char *)ip6h == mp->b_wptr) { + /* + * The first mblk_t only includes the ethernet header. + * Note that it is safe to change the mp pointer here, + * as the subsequent operation does not assume mp + * points to the start of the ethernet header. + */ + mp = mp->b_cont; + + /* + * Make sure ip6h holds the full ip6_t structure. + */ + if (mp == NULL) + return (-1); + + if (MBLKL(mp) < IPV6_HDR_LEN) { + modifiable = (DB_REF(mp) == 1); + + if (modifiable && + !pullupmsg(mp, IPV6_HDR_LEN)) { + return (-1); + } + } + + ip6h = (ip6_t *)mp->b_rptr; + } + + if (!modifiable || !(OK_32PTR((char *)ip6h)) || + ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { + /* + * If either ip6h is not alligned, or ip6h does not + * hold the complete ip6_t structure (a pullupmsg() + * is not an option since it would result in an + * unalligned ip6h), fanout to the default ring. Note + * that this may cause packets reordering. + */ + *indx = 0; + *type = OTH; + fanout_unalligned++; + return (0); + } + + remlen = ntohs(ip6h->ip6_plen); + nexthdr = ip6h->ip6_nxt; + + if (remlen < MIN_EHDR_LEN) + return (-1); + /* + * Do src based fanout if below tunable is set to B_TRUE or + * when mac_ip_hdr_length_v6() fails because of malformed + * packets or because mblk's need to be concatenated using + * pullupmsg(). 
+ */ + if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, + &hdr_len, &nexthdr)) { + goto src_based_fanout; + } + whereptr = (uint8_t *)ip6h + hdr_len; + + /* If the transport is one of below, we do port based fanout */ + switch (nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + /* + * If the ports in the transport header is not part of + * the mblk, do src_based_fanout, instead of calling + * pullupmsg(). + */ + if (mp->b_cont != NULL && + whereptr + PORTS_SIZE > mp->b_wptr) { + goto src_based_fanout; + } + break; + default: + break; + } + + switch (nexthdr) { + case IPPROTO_TCP: + hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), + *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_tcp_ring_count); + *type = OTH; + break; + + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + if (mac_fanout_type == MAC_FANOUT_DEFAULT) { + hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), + *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_udp_ring_count); + } else { + *indx = mac_srs->srs_ind % + mac_srs->srs_udp_ring_count; + mac_srs->srs_ind++; + } + *type = OTH; + break; + + /* For all other protocol, do source based fanout */ + default: + goto src_based_fanout; + } + } else { + *indx = 0; + *type = OTH; + } + return (0); + +src_based_fanout: + hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); + *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); + *type = OTH; + return (0); +} + +/* + * mac_rx_srs_fanout + * + * This routine delivers packets destined to an SRS into a soft ring member + * of the set. + * + * Given a chain of packets we need to split it up into multiple sub chains + * destined for one of the TCP, UDP or OTH soft rings. Instead of entering + * the soft ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker thread + * goes to sleep and then next packets comes in forcing it to wake up etc. + * + * Note: + * Since we know what is the maximum fanout possible, we create a 2D array + * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz + * variables so that we can enter the softrings with chain. We need the + * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc + * for each packet would be expensive). If we ever want to have the + * ability to have unlimited fanout, we should probably declare a head, + * tail, cnt, sz with each soft ring (a data struct which contains a softring + * along with these members) and create an array of this uber struct so we + * don't have to do kmem_alloc. 
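+ *
+ * To make the hashing step concrete, this is how a packet picks its
+ * soft ring (a sketch; the helper name and values are hypothetical,
+ * while HASH_ADDR and COMPUTE_INDEX are the macros defined above):
+ *
+ *	static uint_t
+ *	fanout_index(uint32_t src, uint32_t ports, uint_t nrings)
+ *	{
+ *		uint_t hash = HASH_ADDR(src, ports);
+ *
+ *		return (COMPUTE_INDEX(hash, nrings));
+ *	}
+ *
+ * e.g. with src 192.168.1.5, ports 0xC3500050 (50000/80) and 8 rings,
+ * every packet of that flow maps to the same index, which keeps a
+ * flow ordered while spreading distinct flows across the softrings.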
+ */ +int fanout_oth1 = 0; +int fanout_oth2 = 0; +int fanout_oth3 = 0; +int fanout_oth4 = 0; +int fanout_oth5 = 0; + +static void +mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) +{ + struct ether_header *ehp; + uint16_t etype; + ipha_t *ipha; + uint_t indx; + int ports_offset = -1; + int ipha_len; + uint_t hash; + mac_soft_ring_t *softring; + size_t ether_hlen; + uint16_t frag_offset_flags; + mblk_t *mp; + mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; + mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; + int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; + size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; + size_t sz1; + boolean_t bw_ctl = B_FALSE; + boolean_t hw_classified; + boolean_t dls_bypass = B_TRUE; + int i; + int fanout_cnt; + enum pkt_type type; + mac_client_impl_t *mcip = mac_srs->srs_mcip; + struct ether_vlan_header *evhp; + + if (mac_srs->srs_type & SRST_BW_CONTROL) + bw_ctl = B_TRUE; + + /* + * If we don't have a Rx ring, S/W classification would have done + * its job and its a packet meant for us. If we were polling on + * the default ring (i.e. there was a ring assigned to this SRS), + * then we need to make sure that the mac address really belongs + * to us. + */ + hw_classified = mac_srs->srs_ring != NULL && + mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; + + /* + * Special clients (eg. VLAN, non ether, etc) need DLS + * processing in the Rx path. SRST_DLS_BYPASS will be clear for + * such SRSs. + */ + if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) + dls_bypass = B_FALSE; + + /* + * Since the softrings are never destroyed and we always + * create equal number of softrings for TCP, UDP and rest, + * its OK to check one of them for count and use it without + * any lock. In future, if soft rings get destroyed because + * of reduction in fanout, we will need to ensure that happens + * behind the SRS_PROC. + */ + fanout_cnt = mac_srs->srs_tcp_ring_count; + + bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); + bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); + bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); + bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); + + /* + * We got a chain from SRS that we need to send to the soft rings. + * Since squeues for TCP & IPv4 sap poll their soft rings (for + * performance reasons), we need to separate out v4_tcp, v4_udp + * and the rest goes in other. + */ + while (head != NULL) { + mp = head; + head = head->b_next; + mp->b_next = NULL; + + type = OTH; + sz1 = msgdsize(mp); + + if (!dls_bypass) { + mac_impl_t *mip = mcip->mci_mip; + + indx = 0; + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + ehp = (struct ether_header *)mp->b_rptr; + etype = ntohs(ehp->ether_type); + /* + * For VLAN packets, if the VLAN id doesn't + * belong to this client, we drop the packet. + */ + if (etype == VLAN_TPID) { + /* + * LINTED: cast may result in improper + * alignment + */ + evhp = (struct ether_vlan_header *) + mp->b_rptr; + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } + if (mac_rx_srs_long_fanout(mac_srs, mp, etype, + &type, &indx) == -1) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } + + FANOUT_ENQUEUE_MP(headmp[type][indx], + tailmp[type][indx], cnt[type][indx], bw_ctl, + sz[type][indx], sz1, mp); + continue; + } + + /* + * At this point we can be sure the packet at least + * has an ether header. On the outbound side, GLD/stack + * ensure this. On the inbound side, the driver needs + * to ensure this. 
+ */ + if (sz1 < sizeof (struct ether_header)) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + /* LINTED: cast may result in improper alignment */ + ehp = (struct ether_header *)mp->b_rptr; + + /* + * Determine if this is a VLAN or non-VLAN packet. + */ + if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { + /* LINTED: cast may result in improper alignment */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + ether_hlen = sizeof (struct ether_vlan_header); + /* + * Check if the VID of the packet, if any, belongs + * to this client. + */ + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } else { + ether_hlen = sizeof (struct ether_header); + } + + + /* + * If we are using the default Rx ring where H/W or S/W + * classification has not happened, we need to verify if + * this unicast packet really belongs to us. + */ + if (etype == ETHERTYPE_IP) { + /* + * If we are H/W classified, but we have promisc + * on, then we need to check for the unicast address. + */ + if (hw_classified && mcip->mci_promisc_list != NULL) { + mac_address_t *map; + + rw_enter(&mcip->mci_rw_lock, RW_READER); + map = mcip->mci_unicast; + if (bcmp(&ehp->ether_dhost, map->ma_addr, + map->ma_len) == 0) + type = UNDEF; + rw_exit(&mcip->mci_rw_lock); + } else if (((((uint8_t *)&ehp->ether_dhost)[0] & + 0x01) == 0)) { + type = UNDEF; + } + } + + /* + * This needs to become a contract with the driver for + * the fast path. + */ + + /* LINTED: cast may result in improper alignment */ + ipha = (ipha_t *)(mp->b_rptr + ether_hlen); + if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { + type = OTH; + fanout_oth1++; + } + + if (type != OTH) { + switch (ipha->ipha_protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + ipha_len = IPH_HDR_LENGTH(ipha); + if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > + mp->b_wptr) { + type = OTH; + break; + } + frag_offset_flags = + ntohs(ipha->ipha_fragment_offset_and_flags); + if ((frag_offset_flags & + (IPH_MF | IPH_OFFSET)) != 0) { + type = OTH; + fanout_oth3++; + break; + } + ports_offset = ether_hlen + ipha_len; + break; + default: + type = OTH; + fanout_oth4++; + break; + } + } + + if (type == OTH) { + if (mac_rx_srs_long_fanout(mac_srs, mp, etype, + &type, &indx) == -1) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + + FANOUT_ENQUEUE_MP(headmp[type][indx], + tailmp[type][indx], cnt[type][indx], bw_ctl, + sz[type][indx], sz1, mp); + continue; + } + + ASSERT(type == UNDEF); + + /* + * XXX-Sunay: We should hold srs_lock since ring_count + * below can change. But if we are always called from + * mac_rx_srs_drain and SRS_PROC is set, then we can + * enforce that ring_count can't be changed i.e. + * to change fanout type or ring count, the calling + * thread needs to be behind SRS_PROC. + */ + switch (ipha->ipha_protocol) { + case IPPROTO_TCP: + /* + * Note that for ESP, we fanout on SPI and it is at the + * same offset as the 2x16-bit ports. So it is clumped + * along with TCP, UDP and SCTP. 
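+ *
+ * For example (hypothetical values), an ESP packet whose SPI
+ * is 0x12345678 hashes exactly like a TCP packet whose two
+ * 16-bit ports pack into the same 32 bits, since both are
+ * read as the 4 bytes at ports_offset:
+ *
+ *	hash = HASH_ADDR(ipha->ipha_src,
+ *	    *(uint32_t *)(mp->b_rptr + ports_offset));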
+ */ + hash = HASH_ADDR(ipha->ipha_src, + *(uint32_t *)(mp->b_rptr + ports_offset)); + indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); + type = V4_TCP; + mp->b_rptr += ether_hlen; + break; + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + if (mac_fanout_type == MAC_FANOUT_DEFAULT) { + hash = HASH_ADDR(ipha->ipha_src, + *(uint32_t *)(mp->b_rptr + ports_offset)); + indx = COMPUTE_INDEX(hash, + mac_srs->srs_udp_ring_count); + } else { + indx = mac_srs->srs_ind % + mac_srs->srs_udp_ring_count; + mac_srs->srs_ind++; + } + type = V4_UDP; + mp->b_rptr += ether_hlen; + break; + } + + ASSERT(type != UNDEF); + + FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], + cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); + } + + for (type = V4_TCP; type < UNDEF; type++) { + for (i = 0; i < fanout_cnt; i++) { + if (headmp[type][i] != NULL) { + ASSERT(tailmp[type][i]->b_next == NULL); + switch (type) { + case V4_TCP: + softring = + mac_srs->srs_tcp_soft_rings[i]; + break; + case V4_UDP: + softring = + mac_srs->srs_udp_soft_rings[i]; + break; + case OTH: + softring = + mac_srs->srs_oth_soft_rings[i]; + break; + } + mac_rx_soft_ring_process(mac_srs->srs_mcip, + softring, headmp[type][i], tailmp[type][i], + cnt[type][i], sz[type][i]); + } + } + } +} + +#define SRS_BYTES_TO_PICKUP 150000 +ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; + +/* + * mac_rx_srs_poll_ring + * + * This SRS Poll thread uses this routine to poll the underlying hardware + * Rx ring to get a chain of packets. It can inline process that chain + * if mac_latency_optimize is set (default) or signal the SRS worker thread + * to do the remaining processing. + * + * Since packets come in the system via interrupt or poll path, we also + * update the stats and deal with promiscous clients here. + */ +void +mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) +{ + kmutex_t *lock = &mac_srs->srs_lock; + kcondvar_t *async = &mac_srs->srs_cv; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + mblk_t *head, *tail, *mp; + callb_cpr_t cprinfo; + ssize_t bytes_to_pickup; + size_t sz; + int count; + mac_client_impl_t *smcip; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); + mutex_enter(lock); + +start: + for (;;) { + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + +check_again: + if (mac_srs->srs_type & SRST_BW_CONTROL) { + /* + * We pick as many bytes as we are allowed to queue. + * Its possible that we will exceed the total + * packets queued in case this SRS is part of the + * Rx ring group since > 1 poll thread can be pulling + * upto the max allowed packets at the same time + * but that should be OK. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + bytes_to_pickup = + mac_srs->srs_bw->mac_bw_drop_threshold - + mac_srs->srs_bw->mac_bw_sz; + /* + * We shouldn't have been signalled if we + * have 0 or less bytes to pick but since + * some of the bytes accounting is driver + * dependant, we do the safety check. + */ + if (bytes_to_pickup < 0) + bytes_to_pickup = 0; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + } else { + /* + * ToDO: Need to change the polling API + * to add a packet count and a flag which + * tells the driver whether we want packets + * based on a count, or bytes, or all the + * packets queued in the driver/HW. This + * way, we never have to check the limits + * on poll path. 
We truly let only as many + * packets enter the system as we are willing + * to process or queue. + * + * Something along the lines of + * pkts_to_pickup = mac_soft_ring_max_q_cnt - + * mac_srs->srs_poll_pkt_cnt + */ + + /* + * Since we are not doing B/W control, pick + * as many packets as allowed. + */ + bytes_to_pickup = max_bytes_to_pickup; + } + + /* Poll the underlying Hardware */ + mutex_exit(lock); + head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); + mutex_enter(lock); + + ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == + SRS_POLL_THR_OWNER); + + mp = tail = head; + count = 0; + sz = 0; + while (mp != NULL) { + tail = mp; + sz += msgdsize(mp); + mp = mp->b_next; + count++; + } + + if (head != NULL) { + tail->b_next = NULL; + smcip = mac_srs->srs_mcip; + + if ((mac_srs->srs_type & SRST_FLOW) || + (smcip == NULL)) { + FLOW_STAT_UPDATE(mac_srs->srs_flent, + rbytes, sz); + FLOW_STAT_UPDATE(mac_srs->srs_flent, + ipackets, count); + } + + /* + * If there are any promiscuous mode callbacks + * defined for this MAC client, pass them a copy + * if appropriate and also update the counters. + */ + if (smcip != NULL) { + smcip->mci_stat_ibytes += sz; + smcip->mci_stat_ipackets += count; + + if (smcip->mci_mip->mi_promisc_list != NULL) { + mutex_exit(lock); + mac_promisc_dispatch(smcip->mci_mip, + head, NULL); + mutex_enter(lock); + } + } + if (mac_srs->srs_type & SRST_BW_CONTROL) { + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + mac_srs->srs_bw->mac_bw_polled += sz; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + } + srs_rx->sr_poll_count += count; + MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, + count, sz); + if (count <= 10) + srs_rx->sr_chain_cnt_undr10++; + else if (count > 10 && count <= 50) + srs_rx->sr_chain_cnt_10to50++; + else + srs_rx->sr_chain_cnt_over50++; + } + + /* + * We are guaranteed that SRS_PROC will be set if we + * are here. Also, poll thread gets to run only if + * the drain was being done by a worker thread although + * its possible that worker thread is still running + * and poll thread was sent down to keep the pipeline + * going instead of doing a complete drain and then + * trying to poll the NIC. + * + * So we need to check SRS_WORKER flag to make sure + * that the worker thread is not processing the queue + * in parallel to us. The flags and conditions are + * protected by the srs_lock to prevent any race. We + * ensure that we don't drop the srs_lock from now + * till the end and similarly we don't drop the srs_lock + * in mac_rx_srs_drain() till similar condition check + * are complete. The mac_rx_srs_drain() needs to ensure + * that SRS_WORKER flag remains set as long as its + * processing the queue. + */ + if (!(mac_srs->srs_state & SRS_WORKER) && + (mac_srs->srs_first != NULL)) { + /* + * We have packets to process and worker thread + * is not running. Check to see if poll thread is + * allowed to process. Let it do processing only if it + * picked up some packets from the NIC otherwise + * wakeup the worker thread. + */ + if ((mac_srs->srs_state & SRS_LATENCY_OPT) && + (head != NULL)) { + mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); + if (srs_rx->sr_poll_pkt_cnt <= + srs_rx->sr_lowat) { + srs_rx->sr_poll_again++; + goto check_again; + } else { + /* + * We are already above low water mark + * so stay in the polling mode but no + * need to poll. 
Once we dip below + * the polling threshold, the processing + * thread (soft ring) will signal us + * to poll again (MAC_UPDATE_SRS_COUNT) + */ + srs_rx->sr_poll_drain_no_poll++; + mac_srs->srs_state &= + ~(SRS_PROC|SRS_GET_PKTS); + /* + * In B/W control case, its possible + * that the backlog built up due to + * B/W limit being reached and packets + * are queued only in SRS. In this case, + * we should schedule worker thread + * since no one else will wake us up. + */ + if ((mac_srs->srs_type & + SRST_BW_CONTROL) && + (mac_srs->srs_tid == NULL)) { + mac_srs->srs_tid = + timeout(mac_srs_fire, + mac_srs, 1); + srs_rx->sr_poll_worker_wakeup++; + } + } + } else { + /* + * Wakeup the worker thread for more processing. + * We optimize for throughput in this case. + */ + mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); + MAC_SRS_WORKER_WAKEUP(mac_srs); + srs_rx->sr_poll_sig_worker++; + } + } else if ((mac_srs->srs_first == NULL) && + !(mac_srs->srs_state & SRS_WORKER)) { + /* + * There is nothing queued in SRS and + * no worker thread running. Plus we + * didn't get anything from the H/W + * as well (head == NULL); + */ + ASSERT(head == NULL); + mac_srs->srs_state &= + ~(SRS_PROC|SRS_GET_PKTS); + + /* + * If we have a packets in soft ring, don't allow + * more packets to come into this SRS by keeping the + * interrupts off but not polling the H/W. The + * poll thread will get signaled as soon as + * srs_poll_pkt_cnt dips below poll threshold. + */ + if (srs_rx->sr_poll_pkt_cnt == 0) { + srs_rx->sr_poll_intr_enable++; + MAC_SRS_POLLING_OFF(mac_srs); + } else { + /* + * We know nothing is queued in SRS + * since we are here after checking + * srs_first is NULL. The backlog + * is entirely due to packets queued + * in Soft ring which will wake us up + * and get the interface out of polling + * mode once the backlog dips below + * sr_poll_thres. + */ + srs_rx->sr_poll_no_poll++; + } + } else { + /* + * Worker thread is already running. + * Nothing much to do. If the polling + * was enabled, worker thread will deal + * with that. + */ + mac_srs->srs_state &= ~SRS_GET_PKTS; + srs_rx->sr_poll_goto_sleep++; + } + } +done: + mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; + cv_signal(&mac_srs->srs_async); + /* + * If this is a temporary quiesce then wait for the restart signal + * from the srs worker. Then clear the flags and signal the srs worker + * to ensure a positive handshake and go back to start. + */ + while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) + cv_wait(async, lock); + if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { + ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); + mac_srs->srs_state &= + ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); + cv_signal(&mac_srs->srs_async); + goto start; + } else { + mac_srs->srs_state |= SRS_POLL_THR_EXITED; + cv_signal(&mac_srs->srs_async); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); + } +} + +/* + * mac_srs_pick_chain + * + * In Bandwidth control case, checks how many packets can be processed + * and return them in a sub chain. 
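+ *
+ * The budgeted walk below can be summarized by this sketch
+ * (hypothetical names; the real code also maintains srs_count,
+ * srs_size and the SRS_BW_ENFORCED state under the bw lock):
+ *
+ *	static mblk_t *
+ *	pick_within_budget(mblk_t **qp, size_t budget, size_t *picked)
+ *	{
+ *		mblk_t *head = *qp, *mp = *qp, *prev = NULL;
+ *
+ *		*picked = 0;
+ *		while (mp != NULL && *picked + msgdsize(mp) <= budget) {
+ *			*picked += msgdsize(mp);
+ *			prev = mp;
+ *			mp = mp->b_next;
+ *		}
+ *		if (prev == NULL)
+ *			return (NULL);	// not even one packet fits
+ *		prev->b_next = NULL;	// split the chain
+ *		*qp = mp;		// remainder stays queued
+ *		return (head);
+ *	}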
+ */ +static mblk_t * +mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, + size_t *chain_sz, int *chain_cnt) +{ + mblk_t *head = NULL; + mblk_t *tail = NULL; + size_t sz; + size_t tsz = 0; + int cnt = 0; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= + mac_srs->srs_bw->mac_bw_limit) || + (mac_srs->srs_bw->mac_bw_limit == 0)) { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + head = mac_srs->srs_first; + mac_srs->srs_first = NULL; + *chain_tail = mac_srs->srs_last; + mac_srs->srs_last = NULL; + *chain_sz = mac_srs->srs_size; + *chain_cnt = mac_srs->srs_count; + mac_srs->srs_count = 0; + mac_srs->srs_size = 0; + return (head); + } + + /* + * Can't clear the entire backlog. + * Need to find how many packets to pick + */ + ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); + while ((mp = mac_srs->srs_first) != NULL) { + sz = msgdsize(mp); + if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > + mac_srs->srs_bw->mac_bw_limit) { + if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) + mac_srs->srs_bw->mac_bw_state |= + SRS_BW_ENFORCED; + break; + } + + /* + * The _size & cnt is decremented from the softrings + * when they send up the packet for polling to work + * properly. + */ + tsz += sz; + cnt++; + mac_srs->srs_count--; + mac_srs->srs_size -= sz; + if (tail != NULL) + tail->b_next = mp; + else + head = mp; + tail = mp; + mac_srs->srs_first = mac_srs->srs_first->b_next; + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + if (mac_srs->srs_first == NULL) + mac_srs->srs_last = NULL; + + if (tail != NULL) + tail->b_next = NULL; + *chain_tail = tail; + *chain_cnt = cnt; + *chain_sz = tsz; + + return (head); +} + +/* + * mac_rx_srs_drain + * + * The SRS drain routine. Gets to run to clear the queue. Any thread + * (worker, interrupt, poll) can call this based on processing model. + * The first thing we do is disable interrupts if possible and then + * drain the queue. we also try to poll the underlying hardware if + * there is a dedicated hardware Rx ring assigned to this SRS. + * + * There is a equivalent drain routine in bandwidth control mode + * mac_rx_srs_drain_bw. There is some code duplication between the two + * routines but they are highly performance sensitive and are easier + * to read/debug if they stay separate. Any code changes here might + * also apply to mac_rx_srs_drain_bw as well. + */ +void +mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) +{ + mblk_t *head; + mblk_t *tail; + timeout_id_t tid; + int cnt = 0; + mac_client_impl_t *mcip = mac_srs->srs_mcip; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); +again: + /* If we are blanked i.e. 
can't do upcalls, then we are done */ + if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { + ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || + (mac_srs->srs_state & SRS_PAUSE)); + goto out; + } + + if (mac_srs->srs_first == NULL) + goto out; + + head = mac_srs->srs_first; + mac_srs->srs_first = NULL; + tail = mac_srs->srs_last; + mac_srs->srs_last = NULL; + cnt = mac_srs->srs_count; + mac_srs->srs_count = 0; + + ASSERT(head != NULL); + ASSERT(tail != NULL); + + if ((tid = mac_srs->srs_tid) != 0) + mac_srs->srs_tid = 0; + + mac_srs->srs_state |= (SRS_PROC|proc_type); + + /* Switch to polling mode */ + MAC_SRS_WORKER_POLLING_ON(mac_srs); + if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) + MAC_SRS_POLL_RING(mac_srs); + /* + * mcip is NULL for broadcast and multicast flows. The promisc + * callbacks for broadcast and multicast packets are delivered from + * mac_rx() and we don't need to worry about that case in this path + */ + if (mcip != NULL && mcip->mci_promisc_list != NULL) { + mutex_exit(&mac_srs->srs_lock); + mac_promisc_client_dispatch(mcip, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Check if SRS itself is doing the processing + * This direct path does not apply when subflows are present. In this + * case, packets need to be dispatched to a soft ring according to the + * flow's bandwidth and other resources contraints. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { + mac_direct_rx_t proc; + void *arg1; + mac_resource_handle_t arg2; + + /* + * This is the case when a Rx is directly + * assigned and we have a fully classified + * protocol chain. We can deal with it in + * one shot. + */ + proc = srs_rx->sr_func; + arg1 = srs_rx->sr_arg1; + arg2 = srs_rx->sr_arg2; + + mac_srs->srs_state |= SRS_CLIENT_PROC; + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + proc(arg1, arg2, head, NULL); + /* + * Decrement the size and count here itelf + * since the packet has been processed. + */ + mutex_enter(&mac_srs->srs_lock); + MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); + if (mac_srs->srs_state & SRS_CLIENT_WAIT) + cv_signal(&mac_srs->srs_client_cv); + mac_srs->srs_state &= ~SRS_CLIENT_PROC; + } else { + /* Some kind of softrings based fanout is required */ + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + /* + * Since the fanout routines can deal with chains, + * shoot the entire chain up. + */ + if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) + mac_rx_srs_fanout(mac_srs, head); + else + mac_rx_srs_proto_fanout(mac_srs, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Send the poll thread to pick up any packets arrived + * so far. This also serves as the last check in case + * nothing else is queued in the SRS. The poll thread + * is signalled only in the case the drain was done + * by the worker thread and SRS_WORKER is set. The + * worker thread can run in parallel as long as the + * SRS_WORKER flag is set. We we have nothing else to + * process, we can exit while leaving SRS_PROC set + * which gives the poll thread control to process and + * cleanup once it returns from the NIC. + * + * If we have nothing else to process, we need to + * ensure that we keep holding the srs_lock till + * all the checks below are done and control is + * handed to the poll thread if it was running. 
+ */
+    if (mac_srs->srs_first != NULL) {
+        if (proc_type == SRS_WORKER) {
+            if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+                MAC_SRS_POLL_RING(mac_srs);
+            srs_rx->sr_drain_again++;
+            goto again;
+        } else {
+            srs_rx->sr_drain_worker_sig++;
+            cv_signal(&mac_srs->srs_async);
+        }
+    }
+
+out:
+
+    if (mac_srs->srs_state & SRS_GET_PKTS) {
+        /*
+         * Poll thread is already running. Leave the
+         * SRS_PROC set and hand over the control to
+         * the poll thread.
+         */
+        mac_srs->srs_state &= ~proc_type;
+        srs_rx->sr_drain_poll_running++;
+        return;
+    }
+
+    /*
+     * Even if there are no packets queued in the SRS, we
+     * need to make sure that the shared counter is
+     * clear and any associated softrings have cleared
+     * all the backlog. Otherwise, leave the interface
+     * in polling mode and the poll thread will get
+     * signalled once the count goes down to zero.
+     *
+     * If someone is already draining the queue (SRS_PROC is
+     * set) when the srs_poll_pkt_cnt goes down to zero,
+     * then it means that drain is already running and we
+     * will turn off polling at that time if there is
+     * no backlog.
+     *
+     * As long as there are packets queued either
+     * in the soft ring set or its soft rings, we will leave
+     * the interface in polling mode (even if the drain
+     * was done by the interrupt thread). We signal
+     * the poll thread as well if we have dipped below
+     * the low water mark.
+     *
+     * NOTE: We can't use the MAC_SRS_POLLING_ON macro
+     * since that turns polling on only for the worker thread.
+     * It's not worth turning polling on for the interrupt
+     * thread (since the NIC will not issue another interrupt)
+     * unless a backlog builds up.
+     */
+    if ((srs_rx->sr_poll_pkt_cnt > 0) &&
+        (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
+        mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+        srs_rx->sr_drain_keep_polling++;
+        MAC_SRS_POLLING_ON(mac_srs);
+        if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+            MAC_SRS_POLL_RING(mac_srs);
+        return;
+    }
+
+    /* Nothing else to do. Get out of poll mode */
+    MAC_SRS_POLLING_OFF(mac_srs);
+    mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+    srs_rx->sr_drain_finish_intr++;
+}
+
+/*
+ * mac_rx_srs_drain_bw
+ *
+ * The SRS BW drain routine. Gets to run to clear the queue. Any thread
+ * (worker, interrupt, poll) can call this based on processing model.
+ * The first thing we do is disable interrupts if possible and then
+ * drain the queue. We also try to poll the underlying hardware if
+ * there is a dedicated hardware Rx ring assigned to this SRS.
+ *
+ * There is an equivalent drain routine in non bandwidth control mode,
+ * mac_rx_srs_drain. There is some code duplication between the two
+ * routines but they are highly performance sensitive and are easier
+ * to read/debug if they stay separate. Any code changes here might
+ * also apply to mac_rx_srs_drain as well.
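+ *
+ * The per-tick gate at the top of this routine can be summarized
+ * with the following sketch (hypothetical helper; lbolt is the
+ * system tick counter and the fields mirror mac_bw_ctl_t):
+ *
+ *	static boolean_t
+ *	bw_gate_open(mac_bw_ctl_t *bw, clock_t now)
+ *	{
+ *		if (bw->mac_bw_curr_time != now) {
+ *			// New tick: reset the budget and lift any
+ *			// enforcement left from the previous tick.
+ *			bw->mac_bw_curr_time = now;
+ *			bw->mac_bw_used = 0;
+ *			bw->mac_bw_state &= ~SRS_BW_ENFORCED;
+ *		}
+ *		if (bw->mac_bw_used > bw->mac_bw_limit)
+ *			bw->mac_bw_state |= SRS_BW_ENFORCED;
+ *		return (!(bw->mac_bw_state & SRS_BW_ENFORCED));
+ *	}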
+ */ +void +mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) +{ + mblk_t *head; + mblk_t *tail; + timeout_id_t tid; + size_t sz = 0; + int cnt = 0; + mac_client_impl_t *mcip = mac_srs->srs_mcip; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); +again: + /* Check if we are doing B/W control */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { + mac_srs->srs_bw->mac_bw_curr_time = lbolt; + mac_srs->srs_bw->mac_bw_used = 0; + if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) + mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; + } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + goto done; + } else if (mac_srs->srs_bw->mac_bw_used > + mac_srs->srs_bw->mac_bw_limit) { + mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + goto done; + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + + /* If we are blanked i.e. can't do upcalls, then we are done */ + if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { + ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || + (mac_srs->srs_state & SRS_PAUSE)); + goto done; + } + + sz = 0; + cnt = 0; + if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { + /* + * We couldn't pick up a single packet. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if ((mac_srs->srs_bw->mac_bw_used == 0) && + (mac_srs->srs_size != 0) && + !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { + /* + * Seems like configured B/W doesn't + * even allow processing of 1 packet + * per tick. + * + * XXX: raise the limit to processing + * at least 1 packet per tick. + */ + mac_srs->srs_bw->mac_bw_limit += + mac_srs->srs_bw->mac_bw_limit; + mac_srs->srs_bw->mac_bw_drop_threshold += + mac_srs->srs_bw->mac_bw_drop_threshold; + cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " + "raised B/W limit to %d since not even a " + "single packet can be processed per " + "tick %d\n", (void *)mac_srs, + (int)mac_srs->srs_bw->mac_bw_limit, + (int)msgdsize(mac_srs->srs_first)); + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + goto done; + } + + ASSERT(head != NULL); + ASSERT(tail != NULL); + + /* zero bandwidth: drop all and return to interrupt mode */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (mac_srs->srs_bw->mac_bw_limit == 0) { + srs_rx->sr_drop_count += cnt; + ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); + mac_srs->srs_bw->mac_bw_sz -= sz; + mac_srs->srs_bw->mac_bw_drop_bytes += sz; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + mac_pkt_drop(NULL, NULL, head, B_FALSE); + goto leave_poll; + } else { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + } + + /* + * We can continue processing the queue. + * We need to figure out if there is a fanout needed or + * we can just process this here. + */ + + if ((tid = mac_srs->srs_tid) != 0) + mac_srs->srs_tid = 0; + + mac_srs->srs_state |= (SRS_PROC|proc_type); + MAC_SRS_WORKER_POLLING_ON(mac_srs); + + /* + * mcip is NULL for broadcast and multicast flows. 
The promisc + * callbacks for broadcast and multicast packets are delivered from + * mac_rx() and we don't need to worry about that case in this path + */ + if (mcip != NULL && mcip->mci_promisc_list != NULL) { + mutex_exit(&mac_srs->srs_lock); + mac_promisc_client_dispatch(mcip, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Check if SRS itself is doing the processing + * This direct path does not apply when subflows are present. In this + * case, packets need to be dispatched to a soft ring according to the + * flow's bandwidth and other resources contraints. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { + mac_direct_rx_t proc; + void *arg1; + mac_resource_handle_t arg2; + + /* + * This is the case when a Rx is directly + * assigned and we have a fully classified + * protocol chain. We can deal with it in + * one shot. + */ + proc = srs_rx->sr_func; + arg1 = srs_rx->sr_arg1; + arg2 = srs_rx->sr_arg2; + + mac_srs->srs_state |= SRS_CLIENT_PROC; + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + proc(arg1, arg2, head, NULL); + /* + * Decrement the size and count here itelf + * since the packet has been processed. + */ + mutex_enter(&mac_srs->srs_lock); + MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); + MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); + + if (mac_srs->srs_state & SRS_CLIENT_WAIT) + cv_signal(&mac_srs->srs_client_cv); + mac_srs->srs_state &= ~SRS_CLIENT_PROC; + } else { + /* Some kind of softrings based fanout is required */ + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + /* + * Since the fanout routines can deal with chains, + * shoot the entire chain up. + */ + if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) + mac_rx_srs_fanout(mac_srs, head); + else + mac_rx_srs_proto_fanout(mac_srs, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Send the poll thread to pick up any packets arrived + * so far. This also serves as the last check in case + * nothing else is queued in the SRS. The poll thread + * is signalled only in the case the drain was done + * by the worker thread and SRS_WORKER is set. The + * worker thread can run in parallel as long as the + * SRS_WORKER flag is set. We we have nothing else to + * process, we can exit while leaving SRS_PROC set + * which gives the poll thread control to process and + * cleanup once it returns from the NIC. + * + * If we have nothing else to process, we need to + * ensure that we keep holding the srs_lock till + * all the checks below are done and control is + * handed to the poll thread if it was running. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { + if (mac_srs->srs_first != NULL) { + if (proc_type == SRS_WORKER) { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + if (srs_rx->sr_poll_pkt_cnt <= + srs_rx->sr_lowat) + MAC_SRS_POLL_RING(mac_srs); + goto again; + } else { + cv_signal(&mac_srs->srs_async); + } + } + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + +done: + + if (mac_srs->srs_state & SRS_GET_PKTS) { + /* + * Poll thread is already running. Leave the + * SRS_RPOC set and hand over the control to + * poll thread. + */ + mac_srs->srs_state &= ~proc_type; + return; + } + + /* + * If we can't process packets because we have exceeded + * B/W limit for this tick, just set the timeout + * and leave. 
+ * + * Even if there are no packets queued in SRS, we + * need to make sure that the shared counter is + * clear and any associated softrings have cleared + * all the backlog. Otherwise, leave the interface + * in polling mode and the poll thread will get + * signalled once the count goes down to zero. + * + * If someone is already draining the queue (SRS_PROC is + * set) when the srs_poll_pkt_cnt goes down to zero, + * then it means that drain is already running and we + * will turn off polling at that time if there is + * no backlog. As long as there are packets queued either + * is soft ring set or its soft rings, we will leave + * the interface in polling mode. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && + ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || + (srs_rx->sr_poll_pkt_cnt > 0))) { + MAC_SRS_POLLING_ON(mac_srs); + mac_srs->srs_state &= ~(SRS_PROC|proc_type); + if ((mac_srs->srs_first != NULL) && + (mac_srs->srs_tid == NULL)) + mac_srs->srs_tid = timeout(mac_srs_fire, + mac_srs, 1); + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + return; + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + +leave_poll: + + /* Nothing else to do. Get out of poll mode */ + MAC_SRS_POLLING_OFF(mac_srs); + mac_srs->srs_state &= ~(SRS_PROC|proc_type); +} + +/* + * mac_srs_worker + * + * The SRS worker routine. Drains the queue when no one else is + * processing it. + */ +void +mac_srs_worker(mac_soft_ring_set_t *mac_srs) +{ + kmutex_t *lock = &mac_srs->srs_lock; + kcondvar_t *async = &mac_srs->srs_async; + callb_cpr_t cprinfo; + boolean_t bw_ctl_flag; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); + mutex_enter(lock); + +start: + for (;;) { + bw_ctl_flag = B_FALSE; + if (mac_srs->srs_type & SRST_BW_CONTROL) { + MAC_SRS_BW_LOCK(mac_srs); + MAC_SRS_CHECK_BW_CONTROL(mac_srs); + if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) + bw_ctl_flag = B_TRUE; + MAC_SRS_BW_UNLOCK(mac_srs); + } + /* + * The SRS_BW_ENFORCED flag may change since we have dropped + * the mac_bw_lock. However the drain function can handle both + * a drainable SRS or a bandwidth controlled SRS, and the + * effect of scheduling a timeout is to wakeup the worker + * thread which in turn will call the drain function. Since + * we release the srs_lock atomically only in the cv_wait there + * isn't a fear of waiting for ever. + */ + while (((mac_srs->srs_state & SRS_PROC) || + (mac_srs->srs_first == NULL) || bw_ctl_flag || + (mac_srs->srs_state & SRS_TX_BLOCKED)) && + !(mac_srs->srs_state & SRS_PAUSE)) { + /* + * If we have packets queued and we are here + * because B/W control is in place, we better + * schedule the worker wakeup after 1 tick + * to see if bandwidth control can be relaxed. + */ + if (bw_ctl_flag && mac_srs->srs_tid == NULL) { + /* + * We need to ensure that a timer is already + * scheduled or we force schedule one for + * later so that we can continue processing + * after this quanta is over. 
+ */ + mac_srs->srs_tid = timeout(mac_srs_fire, + mac_srs, 1); + } +wait: + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + if (mac_srs->srs_state & SRS_PROC) + goto wait; + + if (mac_srs->srs_first != NULL && + mac_srs->srs_type & SRST_BW_CONTROL) { + MAC_SRS_BW_LOCK(mac_srs); + if (mac_srs->srs_bw->mac_bw_state & + SRS_BW_ENFORCED) { + MAC_SRS_CHECK_BW_CONTROL(mac_srs); + } + bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & + SRS_BW_ENFORCED; + MAC_SRS_BW_UNLOCK(mac_srs); + } + } + + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + mac_srs->srs_drain_func(mac_srs, SRS_WORKER); + } +done: + /* + * The Rx SRS quiesce logic first cuts off packet supply to the SRS + * from both hard and soft classifications and waits for such threads + * to finish before signaling the worker. So at this point the only + * thread left that could be competing with the worker is the poll + * thread. In the case of Tx, there shouldn't be any thread holding + * SRS_PROC at this point. + */ + if (!(mac_srs->srs_state & SRS_PROC)) { + mac_srs->srs_state |= SRS_PROC; + } else { + ASSERT((mac_srs->srs_type & SRST_TX) == 0); + /* + * Poll thread still owns the SRS and is still running + */ + ASSERT((mac_srs->srs_poll_thr == NULL) || + ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == + SRS_POLL_THR_OWNER)); + } + mac_srs_worker_quiesce(mac_srs); + /* + * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator + * of the quiesce operation + */ + while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + + if (mac_srs->srs_state & SRS_RESTART) { + ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); + mac_srs_worker_restart(mac_srs); + mac_srs->srs_state &= ~SRS_PROC; + goto start; + } + + if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) + mac_srs_worker_quiesce(mac_srs); + + mac_srs->srs_state &= ~SRS_PROC; + /* The macro drops the srs_lock */ + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +/* + * mac_rx_srs_subflow_process + * + * Receive side routine called from interrupt path when there are + * sub flows present on this SRS. + */ +/* ARGSUSED */ +void +mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, + mblk_t *mp_chain, boolean_t loopback) +{ + flow_entry_t *flent = NULL; + flow_entry_t *prev_flent = NULL; + mblk_t *mp = NULL; + mblk_t *tail = NULL; + mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; + mac_client_impl_t *mcip; + + mcip = mac_srs->srs_mcip; + ASSERT(mcip != NULL); + + /* + * We need to determine the SRS for every packet + * by walking the flow table, if we don't get any, + * then we proceed using the SRS we came with. + */ + mp = tail = mp_chain; + while (mp != NULL) { + + /* + * We will increment the stats for the mactching subflow. + * when we get the bytes/pkt count for the classified packets + * later in mac_rx_srs_process. + */ + (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, + FLOW_INBOUND, &flent); + + if (mp == mp_chain || flent == prev_flent) { + if (prev_flent != NULL) + FLOW_REFRELE(prev_flent); + prev_flent = flent; + flent = NULL; + tail = mp; + mp = mp->b_next; + continue; + } + tail->b_next = NULL; + /* + * A null indicates, this is for the mac_srs itself. + * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 
+ */
+ if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
+ mac_rx_srs_process(arg,
+ (mac_resource_handle_t)mac_srs, mp_chain,
+ loopback);
+ } else {
+ (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
+ prev_flent->fe_cb_arg2, mp_chain, loopback);
+ FLOW_REFRELE(prev_flent);
+ }
+ prev_flent = flent;
+ flent = NULL;
+ mp_chain = mp;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ /* Last chain */
+ ASSERT(mp_chain != NULL);
+ if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
+ mac_rx_srs_process(arg,
+ (mac_resource_handle_t)mac_srs, mp_chain, loopback);
+ } else {
+ (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
+ prev_flent->fe_cb_arg2, mp_chain, loopback);
+ FLOW_REFRELE(prev_flent);
+ }
+}
+
+/*
+ * mac_rx_srs_process
+ *
+ * Receive side routine called from the interrupt path.
+ *
+ * loopback is set to force a context switch on the loopback
+ * path between MAC clients.
+ */
+/* ARGSUSED */
+void
+mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
+ boolean_t loopback)
+{
+ mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
+ mblk_t *mp, *tail, *head;
+ int count = 0;
+ int count1;
+ size_t sz = 0;
+ size_t chain_sz, sz1;
+ mac_bw_ctl_t *mac_bw;
+ mac_client_impl_t *smcip;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+
+ /*
+ * Set the tail, count and sz. We set the sz irrespective
+ * of whether we are doing B/W control or not for the
+ * purpose of updating the stats.
+ */
+ mp = tail = mp_chain;
+ while (mp != NULL) {
+ tail = mp;
+ count++;
+ sz += msgdsize(mp);
+ mp = mp->b_next;
+ }
+
+ mutex_enter(&mac_srs->srs_lock);
+ smcip = mac_srs->srs_mcip;
+
+ if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
+ FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
+ FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
+ }
+ if (smcip != NULL) {
+ smcip->mci_stat_ibytes += sz;
+ smcip->mci_stat_ipackets += count;
+ }
+
+ /*
+ * If the SRS is already being processed; has been blanked;
+ * can be processed by worker thread only; or the B/W limit
+ * has been reached, then queue the chain and check if
+ * the worker thread needs to be awakened.
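+ *
+ * For the bandwidth controlled case below, the
+ * outline is (a sketch, not the exact code):
+ *
+ *	bw limit == 0			drop the whole chain
+ *	queued + sz <= drop threshold	enqueue the whole chain
+ *	otherwise			enqueue what fits under
+ *					the threshold, drop the rest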
+ */
+ if (mac_srs->srs_type & SRST_BW_CONTROL) {
+ mac_bw = mac_srs->srs_bw;
+ ASSERT(mac_bw != NULL);
+ mutex_enter(&mac_bw->mac_bw_lock);
+ /* Count the packets and bytes via interrupt */
+ srs_rx->sr_intr_count += count;
+ mac_bw->mac_bw_intr += sz;
+ if (mac_bw->mac_bw_limit == 0) {
+ /* zero bandwidth: drop all */
+ srs_rx->sr_drop_count += count;
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ mutex_exit(&mac_srs->srs_lock);
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ return;
+ } else {
+ if ((mac_bw->mac_bw_sz + sz) <=
+ mac_bw->mac_bw_drop_threshold) {
+ mutex_exit(&mac_bw->mac_bw_lock);
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
+ tail, count, sz);
+ } else {
+ mp = mp_chain;
+ chain_sz = 0;
+ count1 = 0;
+ tail = NULL;
+ head = NULL;
+ while (mp != NULL) {
+ sz1 = msgdsize(mp);
+ if (mac_bw->mac_bw_sz + chain_sz + sz1 >
+ mac_bw->mac_bw_drop_threshold)
+ break;
+ chain_sz += sz1;
+ count1++;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ mutex_exit(&mac_bw->mac_bw_lock);
+ if (tail != NULL) {
+ head = tail->b_next;
+ tail->b_next = NULL;
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, count1, chain_sz);
+ sz -= chain_sz;
+ count -= count1;
+ } else {
+ /* Can't pick up any */
+ head = mp_chain;
+ }
+ if (head != NULL) {
+ /* Drop any packet over the threshold */
+ srs_rx->sr_drop_count += count;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ freemsgchain(head);
+ }
+ }
+ MAC_SRS_WORKER_WAKEUP(mac_srs);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+ }
+
+ /*
+ * If the total number of packets queued in the SRS and
+ * its associated soft rings exceeds the max allowed,
+ * then drop the chain. If we are polling capable, this
+ * shouldn't be happening.
+ */
+ if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
+ (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
+ mac_bw = mac_srs->srs_bw;
+ srs_rx->sr_drop_count += count;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ freemsgchain(mp_chain);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
+ /* Count the packets entering via interrupt path */
+ srs_rx->sr_intr_count += count;
+
+ if (!(mac_srs->srs_state & SRS_PROC)) {
+ /*
+ * If we are coming via loopback or if we are not
+ * optimizing for latency, we should signal the
+ * worker thread.
+ */
+ if (loopback || ((count > 1) &&
+ !(mac_srs->srs_state & SRS_LATENCY_OPT))) {
+ /*
+ * For loopback, we need to let the worker take
+ * over as we don't want to continue in the same
+ * thread even if we can. This could lead to stack
+ * overflows and may also end up using
+ * resources (cpu) incorrectly.
+ */
+ cv_signal(&mac_srs->srs_async);
+ } else {
+ /*
+ * Seems like no one is processing the SRS and
+ * there is no backlog. We also inline process
+ * our packet if it's a single packet in the non
+ * latency optimized case (in the latency optimized
+ * case, we inline process chains of any size).
+ */
+ mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
+ }
+ }
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/* TX SIDE ROUTINES (RUNTIME) */
+
+/*
+ * mac_tx_srs_no_desc
+ *
+ * This routine is called in Tx single ring default mode
+ * when the Tx ring runs out of descs.
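+ *
+ * The caller's flag picks the policy (in outline):
+ *
+ *	MAC_DROP_ON_NO_DESC	drop the chain on the spot
+ *	MAC_TX_NO_ENQUEUE	queue until TX_QUEUED is set, then
+ *				hand the chain back via ret_mp
+ *	neither			enqueue and test the Tx hiwat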
+ */
+mac_tx_cookie_t
+mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+ boolean_t wakeup_worker = B_TRUE;
+ uint32_t tx_mode = srs_tx->st_mode;
+ int cnt, sz;
+ mblk_t *tail;
+
+ ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ } else {
+ if (mac_srs->srs_first != NULL)
+ wakeup_worker = B_FALSE;
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ /*
+ * If TX_QUEUED is not set, queue the
+ * packet and let mac_tx_srs_drain()
+ * set the TX_BLOCKED bit for the
+ * reasons explained above. Otherwise,
+ * return the mblks.
+ */
+ if (wakeup_worker) {
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ } else {
+ MAC_TX_SET_NO_ENQUEUE(mac_srs,
+ mp_chain, ret_mp, cookie);
+ }
+ } else {
+ MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
+ tail, cnt, sz, cookie);
+ }
+ if (wakeup_worker)
+ cv_signal(&mac_srs->srs_async);
+ }
+ return (cookie);
+}
+
+/*
+ * mac_tx_srs_enqueue
+ *
+ * This routine is called when Tx SRS is operating in either serializer
+ * or bandwidth mode. In serializer mode, a packet will get enqueued
+ * when a thread cannot enter SRS exclusively. In bandwidth mode,
+ * packets get queued if the allowed byte-count limit for a tick is
+ * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
+ * MAC_TX_NO_ENQUEUE is set is different than when operating in either
+ * the default mode or fanout mode. Here packets get dropped or
+ * returned to the caller only after hi-watermark worth of data
+ * is queued.
+ */
+static mac_tx_cookie_t
+mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie = NULL;
+ int cnt, sz;
+ mblk_t *tail;
+ boolean_t wakeup_worker = B_TRUE;
+
+ if (mac_srs->srs_first != NULL)
+ wakeup_worker = B_FALSE;
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ } else {
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ }
+ } else if (flag & MAC_TX_NO_ENQUEUE) {
+ if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
+ (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
+ MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
+ ret_mp, cookie);
+ } else {
+ mp_chain->b_prev = (mblk_t *)fanout_hint;
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ }
+ } else {
+ /*
+ * If you are BW_ENFORCED, just enqueue the
+ * packet. srs_worker will drain it at the
+ * prescribed rate. Before enqueueing, save
+ * the fanout hint.
+ */
+ mp_chain->b_prev = (mblk_t *)fanout_hint;
+ MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
+ tail, cnt, sz, cookie);
+ }
+ if (wakeup_worker)
+ cv_signal(&mac_srs->srs_async);
+ return (cookie);
+}
+
+/*
+ * There are five tx modes:
+ *
+ * 1) Default mode (SRS_TX_DEFAULT)
+ * 2) Serialization mode (SRS_TX_SERIALIZE)
+ * 3) Fanout mode (SRS_TX_FANOUT)
+ * 4) Bandwidth mode (SRS_TX_BW)
+ * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
+ *
+ * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
+ * based on the number of Tx rings requested for an SRS and whether
+ * bandwidth control is requested or not.
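+ *
+ * Roughly (a sketch of that decision, not the exact code):
+ *
+ *	Tx rings	B/W control	mode
+ *	--------	-----------	----
+ *	one		no		SRS_TX_DEFAULT
+ *	one		yes		SRS_TX_BW
+ *	many		no		SRS_TX_FANOUT
+ *	many		yes		SRS_TX_BW_FANOUT
+ *
+ * (SRS_TX_SERIALIZE is special-cased; see mac_tx_serializer_mode()
+ * below.)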
+ *
+ * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
+ * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
+ * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
+ * When flow-control is relieved, the srs_worker drains the queued
+ * packets and informs blocked clients to restart sending packets.
+ *
+ * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
+ *
+ * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
+ * Tx rings. Each Tx ring will have a soft ring associated with it.
+ * These soft rings will be hung off the Tx SRS. Queueing, if it happens
+ * due to lack of Tx descs, will be in the individual soft ring (and not
+ * the SRS) associated with the Tx ring.
+ *
+ * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
+ * only if bw is available. Otherwise the packets will be queued in
+ * SRS. If fanout to multiple Tx rings is configured, the packets will
+ * be fanned out among the soft rings associated with the Tx rings.
+ *
+ * Three flags are used in srs_state for indicating flow control
+ * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
+ * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
+ * driver below.
+ * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
+ * and flow-control pressure is applied back to clients. The clients expect
+ * wakeup when flow-control is relieved.
+ * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and mblks
+ * got returned to the client either due to lack of Tx descs or due to bw
+ * control reasons. The clients expect a wakeup when the condition is
+ * relieved.
+ *
+ * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
+ * some clients set the following values too: MAC_DROP_ON_NO_DESC,
+ * MAC_TX_NO_ENQUEUE
+ * Mac clients that do not want packets to be enqueued in the mac layer set
+ * the MAC_DROP_ON_NO_DESC flag. The packets won't be queued in the Tx SRS or
+ * Tx soft rings but instead get dropped when the NIC runs out of descs. The
+ * behaviour of this flag is different when the Tx is running in serializer
+ * or bandwidth mode. Under these (serializer, bandwidth) modes, the packets
+ * get dropped when the Tx high watermark is reached.
+ * There are some mac clients like vsw, aggr that want the mblks to be
+ * returned to the clients instead of being queued in Tx SRS (or Tx soft
+ * rings) under flow-control (i.e., out of desc or exceeding bw limits)
+ * conditions. These clients call mac_tx() with the MAC_TX_NO_ENQUEUE flag
+ * set. In the default and Tx fanout mode, the un-transmitted mblks will be
+ * returned to the clients when the driver runs out of Tx descs.
+ * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in the SRS (or
+ * soft ring) so that the clients can be woken up when Tx descs become
+ * available. When running in serializer or bandwidth mode,
+ * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
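+ *
+ * In short:
+ *	SRS_TX_BLOCKED		- out of Tx descs; the driver's wakeup
+ *				  clears it
+ *	SRS_TX_HIWAT		- the Tx SRS queue crossed the hiwat;
+ *				  clients are flow-controlled
+ *	SRS_TX_WAKEUP_CLIENT	- mblks were handed back under
+ *				  MAC_TX_NO_ENQUEUE; wake the client
+ *				  when relieved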
+ */
+
+mac_tx_func_t
+mac_tx_get_func(uint32_t mode)
+{
+ return (mac_tx_mode_list[mode].mac_tx_func);
+}
+
+/* ARGSUSED */
+static mac_tx_cookie_t
+mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_tx_cookie_t cookie = NULL;
+
+ ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
+
+ /* Regular case with a single Tx ring */
+ /*
+ * SRS_TX_BLOCKED is set when underlying NIC runs
+ * out of Tx descs and messages start getting
+ * queued. It won't get reset until
+ * tx_srs_drain() completely drains out the
+ * messages.
+ */
+ if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
+ /* Tx descs/resources not available */
+ mutex_enter(&mac_srs->srs_lock);
+ if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
+ cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ /*
+ * While we were computing mblk count, the
+ * flow control condition got relieved.
+ * Continue with the transmission.
+ */
+ mutex_exit(&mac_srs->srs_lock);
+ }
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ /*
+ * Multiple threads could be here sending packets.
+ * Under such conditions, it is not possible to
+ * atomically set the SRS_TX_BLOCKED bit to indicate
+ * out of tx desc condition. To atomically set
+ * this, we queue the returned packet and do
+ * the setting of SRS_TX_BLOCKED in
+ * mac_tx_srs_drain().
+ */
+ if (mp_chain != NULL) {
+ mutex_enter(&mac_srs->srs_lock);
+ cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+
+ if (is_subflow)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (NULL);
+}
+
+/*
+ * mac_tx_serializer_mode
+ *
+ * This is an experimental mode implemented as per the request of PAE.
+ * In this mode, all callers attempting to send a packet to the NIC
+ * will get serialized. Only one thread at any time will access the
+ * NIC to send the packet out.
+ */
+/* ARGSUSED */
+static mac_tx_cookie_t
+mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ /* Single ring, serialize below */
+ ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
+ mutex_enter(&mac_srs->srs_lock);
+ if ((mac_srs->srs_first != NULL) ||
+ (mac_srs->srs_state & SRS_PROC)) {
+ /*
+ * In serialization mode, queue all packets until
+ * TX_HIWAT is set.
+ * If drop bit is set, drop if TX_HIWAT is set.
+ * If no_enqueue is set, still enqueue until hiwat
+ * is set and return mblks after TX_HIWAT is set.
+ */
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
+ flag, NULL, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ /*
+ * No packets queued, nothing on proc and no flow
+ * control condition. Fast-path, ok. Do inline
+ * processing.
+ */
+ mac_srs->srs_state |= SRS_PROC;
+ mutex_exit(&mac_srs->srs_lock);
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ?
&stats : NULL));
+
+ mutex_enter(&mac_srs->srs_lock);
+ mac_srs->srs_state &= ~SRS_PROC;
+ if (mp_chain != NULL) {
+ cookie = mac_tx_srs_enqueue(mac_srs,
+ mp_chain, flag, NULL, ret_mp);
+ }
+ if (mac_srs->srs_first != NULL) {
+ /*
+ * We processed our packet inline and a new
+ * packet(s) got queued while we were
+ * processing. Wake up the srs worker.
+ */
+ cv_signal(&mac_srs->srs_async);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (is_subflow && cookie == NULL)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (cookie);
+}
+
+/*
+ * mac_tx_fanout_mode
+ *
+ * In this mode, the SRS will have access to multiple Tx rings to send
+ * the packet out. The fanout hint that is passed as an argument is
+ * used to find an appropriate ring to fanout the traffic. Each Tx
+ * ring, in turn, will have a soft ring associated with it. If a Tx
+ * ring runs out of Tx descs, the returned packets will be queued in
+ * the soft ring associated with that Tx ring. The srs itself will not
+ * queue any packets.
+ */
+static mac_tx_cookie_t
+mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ mac_soft_ring_t *softring;
+ uint_t indx, hash;
+
+ ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
+ hash = HASH_HINT(fanout_hint);
+ indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
+ softring = mac_srs->srs_oth_soft_rings[indx];
+ return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp));
+}
+
+/*
+ * mac_tx_bw_mode
+ *
+ * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
+ * only if bw is available. Otherwise the packets will be queued in
+ * SRS. If the SRS has multiple Tx rings, then packets will get fanned
+ * out to the Tx rings.
+ */
+static mac_tx_cookie_t
+mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ int cnt, sz;
+ mblk_t *tail;
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ ASSERT(TX_BANDWIDTH_MODE(mac_srs));
+ ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
+ mutex_enter(&mac_srs->srs_lock);
+ if (mac_srs->srs_bw->mac_bw_limit == 0) {
+ /* zero bandwidth: drop all */
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ } else if ((mac_srs->srs_first != NULL) ||
+ (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
+ fanout_hint, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ } else if (mac_srs->srs_bw->mac_bw_used >
+ mac_srs->srs_bw->mac_bw_limit) {
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ /*
+ * Wake up the worker thread. Note that the worker
+ * thread has to be woken up so that it
+ * can fire up the timer to be woken up
+ * on the next tick. Also once
+ * BW_ENFORCED is set, it can only be
+ * reset by the srs_worker thread. Until then
+ * all packets will get queued up in SRS
+ * and hence this code path won't be
+ * entered until BW_ENFORCED is reset.
+ */
+ cv_signal(&mac_srs->srs_async);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+
+ mac_srs->srs_bw->mac_bw_used += sz;
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
+ mac_soft_ring_t *softring;
+ uint_t indx, hash;
+
+ hash = HASH_HINT(fanout_hint);
+ indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_oth_ring_count);
+ softring = mac_srs->srs_oth_soft_rings[indx];
+ return (mac_tx_soft_ring_process(softring, mp_chain, flag,
+ ret_mp));
+ } else {
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ if (mp_chain != NULL) {
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (mac_srs->srs_bw->mac_bw_used > sz)
+ mac_srs->srs_bw->mac_bw_used -= sz;
+ else
+ mac_srs->srs_bw->mac_bw_used = 0;
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
+ fanout_hint, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ if (is_subflow)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (NULL);
+ }
+}
+
+/* ARGSUSED */
+void
+mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
+{
+ mblk_t *head, *tail;
+ size_t sz;
+ uint32_t tx_mode;
+ uint_t saved_pkt_count;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ saved_pkt_count = 0;
+ ASSERT(mutex_owned(&mac_srs->srs_lock));
+ ASSERT(!(mac_srs->srs_state & SRS_PROC));
+
+ mac_srs->srs_state |= SRS_PROC;
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+ tx_mode = srs_tx->st_mode;
+ if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
+ if (mac_srs->srs_first != NULL) {
+ head = mac_srs->srs_first;
+ tail = mac_srs->srs_last;
+ saved_pkt_count = mac_srs->srs_count;
+ mac_srs->srs_first = NULL;
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count = 0;
+ mutex_exit(&mac_srs->srs_lock);
+
+ head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ head, &stats);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (head != NULL) {
+ /* Device out of tx desc, set block */
+ if (head->b_next == NULL)
+ VERIFY(head == tail);
+ tail->b_next = mac_srs->srs_first;
+ mac_srs->srs_first = head;
+ mac_srs->srs_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ if (mac_srs->srs_last == NULL)
+ mac_srs->srs_last = tail;
+ MAC_TX_SRS_BLOCK(mac_srs, head);
+ } else {
+ srs_tx->st_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+ } else if (tx_mode == SRS_TX_BW) {
+ /*
+ * We are here because the timer fired and we have some data
+ * to transmit.
Also, mac_tx_srs_worker should have reset the
+ * SRS_BW_ENFORCED flag.
+ */
+ ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
+ head = tail = mac_srs->srs_first;
+ while (mac_srs->srs_first != NULL) {
+ tail = mac_srs->srs_first;
+ tail->b_prev = NULL;
+ mac_srs->srs_first = tail->b_next;
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count--;
+ sz = msgdsize(tail);
+ mac_srs->srs_size -= sz;
+ saved_pkt_count++;
+ MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
+
+ if (mac_srs->srs_bw->mac_bw_used <
+ mac_srs->srs_bw->mac_bw_limit)
+ continue;
+
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = sz;
+ continue;
+ }
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ break;
+ }
+
+ ASSERT((head == NULL && tail == NULL) ||
+ (head != NULL && tail != NULL));
+ if (tail != NULL) {
+ tail->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+
+ head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ head, &stats);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (head != NULL) {
+ uint_t size_sent;
+
+ /* Device out of tx desc, set block */
+ if (head->b_next == NULL)
+ VERIFY(head == tail);
+ tail->b_next = mac_srs->srs_first;
+ mac_srs->srs_first = head;
+ mac_srs->srs_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ if (mac_srs->srs_last == NULL)
+ mac_srs->srs_last = tail;
+ size_sent = sz - stats.ts_obytes;
+ mac_srs->srs_size += size_sent;
+ mac_srs->srs_bw->mac_bw_sz += size_sent;
+ if (mac_srs->srs_bw->mac_bw_used > size_sent) {
+ mac_srs->srs_bw->mac_bw_used -=
+ size_sent;
+ } else {
+ mac_srs->srs_bw->mac_bw_used = 0;
+ }
+ MAC_TX_SRS_BLOCK(mac_srs, head);
+ } else {
+ srs_tx->st_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+ } else if (tx_mode == SRS_TX_BW_FANOUT) {
+ mblk_t *prev;
+ mac_soft_ring_t *softring;
+ uint64_t hint;
+
+ /*
+ * We are here because the timer fired and we
+ * have some quota to transmit.
+ */
+ prev = NULL;
+ head = tail = mac_srs->srs_first;
+ while (mac_srs->srs_first != NULL) {
+ tail = mac_srs->srs_first;
+ mac_srs->srs_first = tail->b_next;
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count--;
+ sz = msgdsize(tail);
+ mac_srs->srs_size -= sz;
+ mac_srs->srs_bw->mac_bw_used += sz;
+ if (prev == NULL)
+ hint = (ulong_t)tail->b_prev;
+ if (hint != (ulong_t)tail->b_prev) {
+ prev->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+ TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
+ head = tail;
+ hint = (ulong_t)tail->b_prev;
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ prev = tail;
+ tail->b_prev = NULL;
+ if (mac_srs->srs_bw->mac_bw_used <
+ mac_srs->srs_bw->mac_bw_limit)
+ continue;
+
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ continue;
+ }
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ break;
+ }
+ ASSERT((head == NULL && tail == NULL) ||
+ (head != NULL && tail != NULL));
+ if (tail != NULL) {
+ tail->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+ TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+ }
+ /*
+ * SRS_TX_FANOUT case not considered here because packets
+ * won't be queued in the SRS for this case. Packets will
+ * be sent directly to soft rings underneath and if there
+ * is any queueing at all, it would be in Tx side soft
+ * rings.
+ */
+
+ /*
+ * When srs_count becomes 0, reset SRS_TX_HIWAT and
+ * SRS_TX_WAKEUP_CLIENT and wake up registered clients.
+ */
+ if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
+ (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
+ mac_tx_notify_cb_t *mtnfp;
+ mac_cb_t *mcb;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ boolean_t wakeup_required = B_FALSE;
+
+ if (mac_srs->srs_state &
+ (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
+ wakeup_required = B_TRUE;
+ }
+ mac_srs->srs_state &= ~(SRS_TX_HIWAT |
+ SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
+ mutex_exit(&mac_srs->srs_lock);
+ if (wakeup_required) {
+ /* Wake up callback registered clients */
+ MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
+ for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
+ mtnfp->mtnf_fn(mtnfp->mtnf_arg,
+ (mac_tx_cookie_t)mac_srs);
+ }
+ MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
+ &mcip->mci_tx_notify_cb_list);
+ /*
+ * If the client is not the primary MAC client, then we
+ * need to send the notification to the client's upper
+ * MAC, i.e. mci_upper_mip.
+ */
+ mac_tx_notify(mcip->mci_upper_mip != NULL ?
+ mcip->mci_upper_mip : mcip->mci_mip);
+ }
+ mutex_enter(&mac_srs->srs_lock);
+ }
+ mac_srs->srs_state &= ~SRS_PROC;
+}
+
+/*
+ * Given a packet, get the flow_entry that identifies the flow
+ * to which that packet belongs. The flow_entry will contain
+ * the transmit function to be used to send the packet. If the
+ * function returns NULL, the packet should be sent using the
+ * underlying NIC.
+ */
+static flow_entry_t *
+mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
+{
+ flow_entry_t *flent = NULL;
+ mac_client_impl_t *mcip;
+ int err;
+
+ /*
+ * Do classification on the packet.
+ */
+ err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
+ if (err != 0)
+ return (NULL);
+
+ /*
+ * This flent might just be an additional one on the MAC client,
+ * i.e. for classification purposes (different fdesc), however
+ * the resources, SRS et al., are in the mci_flent, so if
+ * this isn't the mci_flent, we need to get it.
+ */
+ if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
+ FLOW_REFRELE(flent);
+ flent = mcip->mci_flent;
+ FLOW_TRY_REFHOLD(flent, err);
+ if (err != 0)
+ return (NULL);
+ }
+
+ return (flent);
+}
+
+/*
+ * This macro is only meant to be used by mac_tx_send().
+ */
+#define CHECK_VID_AND_ADD_TAG(mp) { \
+ if (vid_check) { \
+ int err = 0; \
+ \
+ MAC_VID_CHECK(src_mcip, (mp), err); \
+ if (err != 0) { \
+ freemsg((mp)); \
+ (mp) = next; \
+ oerrors++; \
+ continue; \
+ } \
+ } \
+ if (add_tag) { \
+ (mp) = mac_add_vlan_tag((mp), 0, vid); \
+ if ((mp) == NULL) { \
+ (mp) = next; \
+ oerrors++; \
+ continue; \
+ } \
+ } \
+}
+
+mblk_t *
+mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
+ mac_tx_stats_t *stats)
+{
+ mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = src_mcip->mci_mip;
+ uint_t obytes = 0, opackets = 0, oerrors = 0;
+ mblk_t *mp = NULL, *next;
+ boolean_t vid_check, add_tag;
+ uint16_t vid = 0;
+
+ if (mip->mi_nclients > 1) {
+ vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
+ add_tag = MAC_TAG_NEEDED(src_mcip);
+ if (add_tag)
+ vid = mac_client_vid(mch);
+ } else {
+ ASSERT(mip->mi_nclients == 1);
+ vid_check = add_tag = B_FALSE;
+ }
+
+ /*
+ * Fastpath: if there's only one client, and there are no
+ * multicast listeners, we simply send the packet down to the
+ * underlying NIC.
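+ * (That is the mi_nactiveclients == 1 &&
+ * mi_promisc_list == NULL test below.)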
+ */ + if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { + DTRACE_PROBE2(fastpath, + mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); + + mp = mp_chain; + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + opackets++; + obytes += (mp->b_cont == NULL ? MBLKL(mp) : + msgdsize(mp)); + + CHECK_VID_AND_ADD_TAG(mp); + MAC_TX(mip, ring, mp, src_mcip); + + /* + * If the driver is out of descriptors and does a + * partial send it will return a chain of unsent + * mblks. Adjust the accounting stats. + */ + if (mp != NULL) { + opackets--; + obytes -= msgdsize(mp); + mp->b_next = next; + break; + } + mp = next; + } + goto done; + } + + /* + * No fastpath, we either have more than one MAC client + * defined on top of the same MAC, or one or more MAC + * client promiscuous callbacks. + */ + DTRACE_PROBE3(slowpath, mac_client_impl_t *, + src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); + + if (mip->mi_promisc_list != NULL) + mac_promisc_dispatch(mip, mp_chain, src_mcip); + + mp = mp_chain; + while (mp != NULL) { + flow_entry_t *dst_flow_ent; + void *flow_cookie; + size_t pkt_size; + mblk_t *mp1; + + next = mp->b_next; + mp->b_next = NULL; + opackets++; + pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); + obytes += pkt_size; + CHECK_VID_AND_ADD_TAG(mp); + + /* + * Find the destination. + */ + dst_flow_ent = mac_tx_classify(mip, mp); + + if (dst_flow_ent != NULL) { + size_t hdrsize; + int err = 0; + + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + struct ether_vlan_header *evhp = + (struct ether_vlan_header *)mp->b_rptr; + + if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) + hdrsize = sizeof (*evhp); + else + hdrsize = sizeof (struct ether_header); + } else { + mac_header_info_t mhi; + + err = mac_header_info((mac_handle_t)mip, + mp, &mhi); + if (err == 0) + hdrsize = mhi.mhi_hdrsize; + } + + /* + * Got a matching flow. It's either another + * MAC client, or a broadcast/multicast flow. + * Make sure the packet size is within the + * allowed size. If not drop the packet and + * move to next packet. + */ + if (err != 0 || + (pkt_size - hdrsize) > mip->mi_sdu_max) { + oerrors++; + DTRACE_PROBE2(loopback__drop, size_t, pkt_size, + mblk_t *, mp); + freemsg(mp); + mp = next; + FLOW_REFRELE(dst_flow_ent); + continue; + } + flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { + /* + * The vnic_bcast_send function expects + * to receive the sender MAC client + * as value for arg2. + */ + mac_bcast_send(flow_cookie, src_mcip, mp, + B_TRUE); + } else { + /* + * loopback the packet to a + * local MAC client. We force a context + * switch if both source and destination + * MAC clients are used by IP, i.e. bypass + * is set. + */ + boolean_t do_switch; + mac_client_impl_t *dst_mcip = + dst_flow_ent->fe_mcip; + + do_switch = ((src_mcip->mci_state_flags & + dst_mcip->mci_state_flags & + MCIS_CLIENT_POLL_CAPABLE) != 0); + + if ((mp1 = mac_fix_cksum(mp)) != NULL) { + (dst_flow_ent->fe_cb_fn)( + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp1, do_switch); + } + } + FLOW_REFRELE(dst_flow_ent); + } else { + /* + * Unknown destination, send via the underlying + * NIC. 
+ */
+ MAC_TX(mip, ring, mp, src_mcip);
+ if (mp != NULL) {
+ /*
+ * Adjust for the last packet that
+ * could not be transmitted
+ */
+ opackets--;
+ obytes -= pkt_size;
+ mp->b_next = next;
+ break;
+ }
+ }
+ mp = next;
+ }
+
+done:
+ src_mcip->mci_stat_obytes += obytes;
+ src_mcip->mci_stat_opackets += opackets;
+ src_mcip->mci_stat_oerrors += oerrors;
+
+ if (stats != NULL) {
+ stats->ts_opackets = opackets;
+ stats->ts_obytes = obytes;
+ stats->ts_oerrors = oerrors;
+ }
+ return (mp);
+}
+
+/*
+ * mac_tx_srs_ring_present
+ *
+ * Returns whether the specified ring is part of the specified SRS.
+ */
+boolean_t
+mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
+{
+ int i;
+ mac_soft_ring_t *soft_ring;
+
+ if (srs->srs_tx.st_arg2 == tx_ring)
+ return (B_TRUE);
+
+ for (i = 0; i < srs->srs_oth_ring_count; i++) {
+ soft_ring = srs->srs_oth_soft_rings[i];
+ if (soft_ring->s_ring_tx_arg2 == tx_ring)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * mac_tx_srs_wakeup
+ *
+ * Called when Tx descs become available. Wake up the appropriate worker
+ * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
+ * state field.
+ */
+void
+mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
+{
+ int i;
+ mac_soft_ring_t *sringp;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (TX_SINGLE_RING_MODE(mac_srs)) {
+ if (srs_tx->st_arg2 == ring &&
+ mac_srs->srs_state & SRS_TX_BLOCKED) {
+ mac_srs->srs_state &= ~SRS_TX_BLOCKED;
+ srs_tx->st_unblocked_cnt++;
+ cv_signal(&mac_srs->srs_async);
+ }
+ /*
+ * A wakeup can come before tx_srs_drain() could
+ * grab srs lock and set SRS_TX_BLOCKED. So
+ * always set woken_up flag when we come here.
+ */
+ srs_tx->st_woken_up = B_TRUE;
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ /* If you are here, it is for FANOUT or BW_FANOUT case */
+ ASSERT(TX_MULTI_RING_MODE(mac_srs));
+ for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+ sringp = mac_srs->srs_oth_soft_rings[i];
+ mutex_enter(&sringp->s_ring_lock);
+ if (sringp->s_ring_tx_arg2 == ring) {
+ if (sringp->s_ring_state & S_RING_BLOCK) {
+ sringp->s_ring_state &= ~S_RING_BLOCK;
+ sringp->s_ring_unblocked_cnt++;
+ cv_signal(&sringp->s_ring_async);
+ }
+ sringp->s_ring_tx_woken_up = B_TRUE;
+ }
+ mutex_exit(&sringp->s_ring_lock);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
+ * the blocked clients again.
+ */
+void
+mac_tx_notify(mac_impl_t *mip)
+{
+ i_mac_notify(mip, MAC_NOTE_TX);
+}
+
+/*
+ * RX SOFTRING RELATED FUNCTIONS
+ *
+ * These functions really belong in mac_soft_ring.c and are here for
+ * a short period.
+ */
+
+#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
+ /* \
+ * Enqueue our mblk chain. \
+ */ \
+ ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
+ \
+ if ((ringp)->s_ring_last != NULL) \
+ (ringp)->s_ring_last->b_next = (mp); \
+ else \
+ (ringp)->s_ring_first = (mp); \
+ (ringp)->s_ring_last = (tail); \
+ (ringp)->s_ring_count += (cnt); \
+ ASSERT((ringp)->s_ring_count > 0); \
+ if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \
+ (ringp)->s_ring_size += sz; \
+ } \
+}
+
+/*
+ * Default entry point to deliver a packet chain to a MAC client.
+ * If the MAC client has flows, do the classification with these
+ * flows as well.
+ */
+/* ARGSUSED */
+void
+mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
+ mac_header_info_t *arg3)
+{
+ mac_client_impl_t *mcip = arg1;
+
+ if (mcip->mci_nvids == 1 &&
+ !(mcip->mci_state_flags & MCIS_TAG_DISABLE)) {
+ /*
+ * If the client has exactly one VID associated with it
+ * and stripping of the VLAN header is not disabled,
+ * remove the VLAN tag from the packet before
+ * passing it on to the client's receive callback.
+ * Note that this needs to be done after we dispatch
+ * the packet to the promiscuous listeners of the
+ * client, since they expect to see the whole
+ * frame including the VLAN headers.
+ */
+ mp_chain = mac_strip_vlan_tag_chain(mp_chain);
+ }
+
+ mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
+}
+
+/*
+ * mac_rx_soft_ring_process
+ *
+ * Process a chain for a given soft ring. If the number of packets queued
+ * in the SRS and its associated soft rings (including this one) is
+ * very small (tracked by srs_poll_pkt_cnt), then allow the entering
+ * thread (interrupt or poll thread) to do inline processing. This
+ * helps keep the latency down under low load.
+ *
+ * The proc and arg for each mblk are already stored in the mblk in
+ * appropriate places.
+ */
+/* ARGSUSED */
+void
+mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
+ mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
+{
+ mac_direct_rx_t proc;
+ void *arg1;
+ mac_resource_handle_t arg2;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(tail != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_total_inpkt += cnt;
+ if ((ringp->s_ring_type & ST_RING_ANY) ||
+ ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
+ !mac_srs->srs_rx.sr_enqueue_always)) {
+ /* If on processor or blanking on, then enqueue and return */
+ if (ringp->s_ring_state & S_RING_BLANK ||
+ ringp->s_ring_state & S_RING_PROC) {
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ proc = ringp->s_ring_rx_func;
+ arg1 = ringp->s_ring_rx_arg1;
+ arg2 = ringp->s_ring_rx_arg2;
+ /*
+ * See if anything is already queued. If we are the
+ * first packet, do inline processing else queue the
+ * packet and do the drain.
+ */
+ if (ringp->s_ring_first == NULL) {
+ /*
+ * Fast-path, ok to process and nothing queued.
+ */
+ ringp->s_ring_run = curthread;
+ ringp->s_ring_state |= (S_RING_PROC);
+
+ mutex_exit(&ringp->s_ring_lock);
+
+ /*
+ * We have a chain of 1 packet, so
+ * go through this fast path.
+ */
+ ASSERT(mp_chain->b_next == NULL);
+
+ (*proc)(arg1, arg2, mp_chain, NULL);
+
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ /*
+ * If we have a soft ring set which is doing
+ * bandwidth control, we need to decrement
+ * srs_size and count so that the SRS can have an
+ * accurate idea of the real data
+ * queued between the SRS and its soft rings. We
+ * decrement the counters only when the packet
+ * gets processed by both SRS and the soft ring.
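+ * (That is what the MAC_UPDATE_SRS_COUNT_LOCKED and
+ * MAC_UPDATE_SRS_SIZE_LOCKED calls under srs_lock,
+ * just below, do.)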
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+ mutex_exit(&mac_srs->srs_lock);
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_run = NULL;
+ ringp->s_ring_state &= ~S_RING_PROC;
+ if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
+ cv_signal(&ringp->s_ring_client_cv);
+
+ if ((ringp->s_ring_first == NULL) ||
+ (ringp->s_ring_state & S_RING_BLANK)) {
+ /*
+ * We processed inline our packet and
+ * nothing new has arrived or our
+ * receiver doesn't want to receive
+ * any packets. We are done.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+ } else {
+ SOFT_RING_ENQUEUE_CHAIN(ringp,
+ mp_chain, tail, cnt, sz);
+ }
+
+ /*
+ * We are here because either we couldn't do inline
+ * processing (because something was already
+ * queued), or we had a chain of more than one
+ * packet, or something else arrived after we were
+ * done with inline processing.
+ */
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ ASSERT(ringp->s_ring_first != NULL);
+
+ ringp->s_ring_drain_func(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ } else {
+ /* ST_RING_WORKER_ONLY case */
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ }
+}
+
+/*
+ * TX SOFTRING RELATED FUNCTIONS
+ *
+ * These functions really belong in mac_soft_ring.c and are here for
+ * a short period.
+ */
+
+#define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \
+ ringp->s_ring_state |= S_RING_ENQUEUED; \
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \
+}
+
+/*
+ * mac_tx_sring_enqueue
+ *
+ * When we are out of transmit descriptors and we already have a
+ * queue that exceeds hiwat (or the client called us with
+ * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
+ * soft ring pointer as the opaque cookie for the client to enable
+ * flow control.
+ */
+static mac_tx_cookie_t
+mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
+ mblk_t **ret_mp)
+{
+ int cnt;
+ size_t sz;
+ mblk_t *tail;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+ mac_tx_cookie_t cookie = NULL;
+ boolean_t wakeup_worker = B_TRUE;
+
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ /* increment freed stats */
+ ringp->s_ring_drops += cnt;
+ cookie = (mac_tx_cookie_t)ringp;
+ } else {
+ if (ringp->s_ring_first != NULL)
+ wakeup_worker = B_FALSE;
+
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ /*
+ * If QUEUED is not set, queue the packet
+ * and let mac_tx_soft_ring_drain() set
+ * the TX_BLOCKED bit for the reasons
+ * explained above. Otherwise, return the
+ * mblks.
+ */
+ if (wakeup_worker) {
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
+ mp_chain, tail, cnt, sz);
+ } else {
+ ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
+ cookie = (mac_tx_cookie_t)ringp;
+ *ret_mp = mp_chain;
+ }
+ } else {
+ boolean_t enqueue = B_TRUE;
+
+ if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
+ /*
+ * flow-controlled.
Store ringp in cookie
+ * so that it can be returned as
+ * mac_tx_cookie_t to client
+ */
+ ringp->s_ring_state |= S_RING_TX_HIWAT;
+ cookie = (mac_tx_cookie_t)ringp;
+ ringp->s_ring_hiwat_cnt++;
+ if (ringp->s_ring_count >
+ ringp->s_ring_tx_max_q_cnt) {
+ /* increment freed stats */
+ ringp->s_ring_drops += cnt;
+ /*
+ * b_prev may be set to the fanout hint
+ * hence can't use freemsg directly
+ */
+ mac_pkt_drop(NULL, NULL,
+ mp_chain, B_FALSE);
+ DTRACE_PROBE1(tx_queued_hiwat,
+ mac_soft_ring_t *, ringp);
+ enqueue = B_FALSE;
+ }
+ }
+ if (enqueue) {
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
+ tail, cnt, sz);
+ }
+ }
+ if (wakeup_worker)
+ cv_signal(&ringp->s_ring_async);
+ }
+ return (cookie);
+}
+
+
+/*
+ * mac_tx_soft_ring_process
+ *
+ * This routine is called when fanning out outgoing traffic among
+ * multiple Tx rings.
+ * Note that a soft ring is associated with a h/w Tx ring.
+ */
+mac_tx_cookie_t
+mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+ int cnt;
+ size_t sz;
+ mblk_t *tail;
+ mac_tx_cookie_t cookie = NULL;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ /*
+ * Only two modes can come here; either it can be
+ * SRS_TX_BW_FANOUT or SRS_TX_FANOUT
+ */
+ ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
+ mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
+
+ if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
+ /* Serialization mode */
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
+ cookie = mac_tx_sring_enqueue(ringp, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
+ /*
+ * If ring is blocked due to lack of Tx
+ * descs, just return. Worker thread
+ * will get scheduled when Tx descs
+ * become available.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ } else {
+ /* Default fanout mode */
+ /*
+ * S_RING_BLOCKED is set when underlying NIC runs
+ * out of Tx descs and messages start getting
+ * queued. It won't get reset until
+ * tx_srs_drain() completely drains out the
+ * messages.
+ */
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+
+ if (ringp->s_ring_state & S_RING_ENQUEUED) {
+ /* Tx descs/resources not available */
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_state & S_RING_ENQUEUED) {
+ cookie = mac_tx_sring_enqueue(ringp, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ /*
+ * While we were computing mblk count, the
+ * flow control condition got relieved.
+ * Continue with the transmission.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ }
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
+ ringp->s_ring_tx_arg2, mp_chain,
+ (is_subflow ? &stats : NULL));
+
+ /*
+ * Multiple threads could be here sending packets.
+ * Under such conditions, it is not possible to
+ * atomically set the S_RING_BLOCKED bit to indicate
+ * out of tx desc condition. To atomically set
+ * this, we queue the returned packet and do
+ * the setting of S_RING_BLOCKED in
+ * mac_tx_soft_ring_drain().
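+ * (This mirrors the single Tx ring case in
+ * mac_tx_single_ring_mode() above.)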
+ */
+ if (mp_chain != NULL) {
+ mutex_enter(&ringp->s_ring_lock);
+ cookie =
+ mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+ }
+ return (NULL);
+ }
+}
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
new file mode 100644
index 0000000000..ff6991ada2
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -0,0 +1,732 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * General Soft rings - Simulating Rx rings in S/W.
+ *
+ * Soft ring is a data abstraction containing a queue and a worker
+ * thread and represents a hardware Rx ring in software. Each soft
+ * ring set can have a collection of soft rings for separating
+ * L3/L4 specific traffic (IPv4 from IPv6 or TCP from UDP) or for
+ * allowing a higher degree of parallelism by sending traffic to
+ * one of the soft rings for an SRS (using a hash on src IP or port).
+ * Each soft ring worker thread can be bound to a different CPU,
+ * allowing the processing for each soft ring to happen in parallel
+ * and independently of the others.
+ *
+ * Protocol soft rings:
+ *
+ * Each SRS has at a minimum 3 softrings. One each for IPv4 TCP,
+ * IPv4 UDP and the rest (OTH - for IPv6 and everything else). The
+ * SRS does dynamic polling and enforces link level bandwidth but
+ * it does so for all traffic (IPv4 and IPv6 and all protocols) on
+ * that link. However, each protocol layer wants a different
+ * behaviour. For instance IPv4 TCP has per CPU squeues which
+ * enforce their own polling and flow control so IPv4 TCP traffic
+ * needs to go to a separate soft ring which can be polled by the
+ * TCP squeue. It also allows the TCP squeue to push back flow control
+ * all the way to NIC hardware (if it puts its corresponding soft
+ * ring in the poll mode and the soft ring queue builds up, the
+ * shared srs_poll_pkt_cnt goes up and SRS automatically stops
+ * more packets from entering the system).
+ *
+ * Similarly, UDP benefits from a DLS bypass and packet chaining
+ * so sending it to a separate soft ring is desired. All the rest of
+ * the traffic (including IPv6) is sent to the OTH softring. The IPv6
+ * traffic currently goes through the OTH softring and via DLS because
+ * it needs more processing to be done. Irrespective of the sap
+ * (IPv4 or IPv6) or the transport, the dynamic polling, B/W enforcement,
+ * cpu assignment, fanout, etc. apply to all traffic since they
+ * are implemented by the SRS, which is agnostic to sap or transport.
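+ *
+ * A rough picture of the minimal protocol fanout for one SRS:
+ *
+ *	NIC Rx ring --> SRS --+--> TCP soft ring --> TCP squeue
+ *	                      +--> UDP soft ring --> DLS bypass
+ *	                      +--> OTH soft ring --> DLS (IPv6 etc.)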
+ *
+ * Fanout soft rings:
+ *
+ * On a multithreaded system, we can assign more CPUs and multithread
+ * the stack by creating a soft ring per CPU and spreading traffic
+ * based on a hash computed on src IP etc. Since we still need to
+ * keep the protocol separation, we create a set of 3 soft rings per
+ * CPU (specified by cpu list or degree of fanout).
+ *
+ * NOTE: See the block level comment on top of mac_sched.c
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static void mac_rx_soft_ring_drain(mac_soft_ring_t *);
+static void mac_soft_ring_fire(void *);
+static void mac_soft_ring_worker(mac_soft_ring_t *);
+static void mac_tx_soft_ring_drain(mac_soft_ring_t *);
+
+uint32_t mac_tx_soft_ring_max_q_cnt = 100000;
+uint32_t mac_tx_soft_ring_hiwat = 1000;
+
+extern kmem_cache_t *mac_soft_ring_cache;
+
+#define ADD_SOFTRING_TO_SET(mac_srs, softring) { \
+ if (mac_srs->srs_soft_ring_head == NULL) { \
+ mac_srs->srs_soft_ring_head = softring; \
+ mac_srs->srs_soft_ring_tail = softring; \
+ } else { \
+ /* ADD to the list */ \
+ softring->s_ring_prev = \
+ mac_srs->srs_soft_ring_tail; \
+ mac_srs->srs_soft_ring_tail->s_ring_next = softring; \
+ mac_srs->srs_soft_ring_tail = softring; \
+ } \
+ mac_srs->srs_soft_ring_count++; \
+}
+
+/*
+ * mac_soft_ring_worker_wakeup
+ *
+ * Wake up the soft ring worker thread to process the queue as long
+ * as no one else is processing it and the upper layer (client) is still
+ * ready to receive packets.
+ */
+void
+mac_soft_ring_worker_wakeup(mac_soft_ring_t *ringp)
+{
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ if (!(ringp->s_ring_state & S_RING_PROC) &&
+ !(ringp->s_ring_state & S_RING_BLANK) &&
+ (ringp->s_ring_tid == NULL)) {
+ if (ringp->s_ring_wait != 0) {
+ ringp->s_ring_tid =
+ timeout(mac_soft_ring_fire, ringp,
+ ringp->s_ring_wait);
+ } else {
+ /* Schedule the worker thread. */
+ cv_signal(&ringp->s_ring_async);
+ }
+ }
+}
+
+/*
+ * mac_soft_ring_create
+ *
+ * Create a soft ring, do the necessary setup and bind the worker
+ * thread to the assigned CPU.
+ */ +mac_soft_ring_t * +mac_soft_ring_create(int id, clock_t wait, void *flent, uint16_t type, + pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs, + processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1, + mac_resource_handle_t x_arg2) +{ + mac_soft_ring_t *ringp; + char name[64]; + + bzero(name, 64); + ringp = kmem_cache_alloc(mac_soft_ring_cache, KM_SLEEP); + + if (type & ST_RING_TCP) { + (void) snprintf(name, sizeof (name), + "mac_tcp_soft_ring_%d_%p", id, mac_srs); + } else if (type & ST_RING_UDP) { + (void) snprintf(name, sizeof (name), + "mac_udp_soft_ring_%d_%p", id, mac_srs); + } else { + (void) snprintf(name, sizeof (name), + "mac_oth_soft_ring_%d_%p", id, mac_srs); + } + + bzero(ringp, sizeof (mac_soft_ring_t)); + (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1); + ringp->s_ring_name[S_RING_NAMELEN] = '\0'; + mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL); + ringp->s_ring_notify_cb_info.mcbi_lockp = &ringp->s_ring_lock; + + ringp->s_ring_type = type; + ringp->s_ring_wait = MSEC_TO_TICK(wait); + ringp->s_ring_mcip = mcip; + ringp->s_ring_set = mac_srs; + ringp->s_ring_flent = flent; + + /* + * Protect against access from DR callbacks (mac_walk_srs_bind/unbind) + * which can't grab the mac perimeter + */ + mutex_enter(&mac_srs->srs_lock); + ADD_SOFTRING_TO_SET(mac_srs, ringp); + mutex_exit(&mac_srs->srs_lock); + + /* + * set the bind CPU to -1 to indicate + * no thread affinity set + */ + ringp->s_ring_cpuid = ringp->s_ring_cpuid_save = -1; + ringp->s_ring_worker = thread_create(NULL, 0, + mac_soft_ring_worker, ringp, 0, &p0, TS_RUN, pri); + if (type & ST_RING_TX) { + ringp->s_ring_drain_func = mac_tx_soft_ring_drain; + ringp->s_ring_tx_arg1 = x_arg1; + ringp->s_ring_tx_arg2 = x_arg2; + ringp->s_ring_tx_max_q_cnt = mac_tx_soft_ring_max_q_cnt; + ringp->s_ring_tx_hiwat = + (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ? + mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat; + } else { + ringp->s_ring_drain_func = mac_rx_soft_ring_drain; + ringp->s_ring_rx_func = rx_func; + ringp->s_ring_rx_arg1 = x_arg1; + ringp->s_ring_rx_arg2 = x_arg2; + } + if (cpuid != -1) + (void) mac_soft_ring_bind(ringp, cpuid); + + return (ringp); +} + +/* + * mac_soft_ring_free + * + * Free the soft ring once we are done with it. + */ +void +mac_soft_ring_free(mac_soft_ring_t *softring, boolean_t release_tx_ring) +{ + ASSERT((softring->s_ring_state & + (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == + (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); + mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + if (release_tx_ring && softring->s_ring_tx_arg2 != NULL) { + ASSERT(softring->s_ring_type & ST_RING_TX); + mac_release_tx_ring(softring->s_ring_tx_arg2); + } + if (softring->s_ring_ksp) + kstat_delete(softring->s_ring_ksp); + mac_callback_free(softring->s_ring_notify_cb_list); + kmem_cache_free(mac_soft_ring_cache, softring); +} + +int mac_soft_ring_thread_bind = 1; + +/* + * mac_soft_ring_bind + * + * Bind a soft ring worker thread to supplied CPU. 
+ */
+cpu_t *
+mac_soft_ring_bind(mac_soft_ring_t *ringp, processorid_t cpuid)
+{
+ cpu_t *cp;
+ boolean_t clear = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (mac_soft_ring_thread_bind == 0) {
+ DTRACE_PROBE1(mac__soft__ring__no__cpu__bound,
+ mac_soft_ring_t *, ringp);
+ return (NULL);
+ }
+
+ cp = cpu_get(cpuid);
+ if (cp == NULL || !cpu_is_online(cp))
+ return (NULL);
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_state |= S_RING_BOUND;
+ if (ringp->s_ring_cpuid != -1)
+ clear = B_TRUE;
+ ringp->s_ring_cpuid = cpuid;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (clear)
+ thread_affinity_clear(ringp->s_ring_worker);
+
+ DTRACE_PROBE2(mac__soft__ring__cpu__bound, mac_soft_ring_t *,
+ ringp, processorid_t, cpuid);
+
+ thread_affinity_set(ringp->s_ring_worker, cpuid);
+
+ return (cp);
+}
+
+/*
+ * mac_soft_ring_unbind
+ *
+ * Unbind a soft ring worker thread.
+ */
+void
+mac_soft_ring_unbind(mac_soft_ring_t *ringp)
+{
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (!(ringp->s_ring_state & S_RING_BOUND)) {
+ ASSERT(ringp->s_ring_cpuid == -1);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_cpuid = -1;
+ ringp->s_ring_state &= ~S_RING_BOUND;
+ thread_affinity_clear(ringp->s_ring_worker);
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * PRIVATE FUNCTIONS
+ */
+
+static void
+mac_soft_ring_fire(void *arg)
+{
+ mac_soft_ring_t *ringp = arg;
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_tid == 0) {
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_tid = 0;
+
+ if (!(ringp->s_ring_state & S_RING_PROC)) {
+ cv_signal(&ringp->s_ring_async);
+ }
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * mac_rx_soft_ring_drain
+ *
+ * Called when the worker thread model (ST_RING_WORKER_ONLY) of processing
+ * incoming packets is used. s_ring_first contains the queued packets.
+ * s_ring_rx_func contains the upper level (client) routine where the
+ * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the
+ * cookies meant for the client.
+ */
+/* ARGSUSED */
+static void
+mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
+{
+ mblk_t *mp;
+ void *arg1;
+ mac_resource_handle_t arg2;
+ timeout_id_t tid;
+ mac_direct_rx_t proc;
+ size_t sz;
+ int cnt;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ringp->s_ring_run = curthread;
+ ASSERT(mutex_owned(&ringp->s_ring_lock));
+ ASSERT(!(ringp->s_ring_state & S_RING_PROC));
+
+ if ((tid = ringp->s_ring_tid) != 0)
+ ringp->s_ring_tid = 0;
+
+ ringp->s_ring_state |= S_RING_PROC;
+
+ proc = ringp->s_ring_rx_func;
+ arg1 = ringp->s_ring_rx_arg1;
+ arg2 = ringp->s_ring_rx_arg2;
+
+ while ((ringp->s_ring_first != NULL) &&
+ !(ringp->s_ring_state & S_RING_PAUSE)) {
+ mp = ringp->s_ring_first;
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ cnt = ringp->s_ring_count;
+ ringp->s_ring_count = 0;
+ sz = ringp->s_ring_size;
+ ringp->s_ring_size = 0;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ (*proc)(arg1, arg2, mp, NULL);
+
+ /*
+ * If we have a soft ring set which is doing
+ * bandwidth control, we need to decrement its
+ * srs_size so it can have an accurate idea of
+ * the real data queued between the SRS and
+ * its soft rings. We decrement the size for a
+ * packet only when it gets processed by both
+ * the SRS and the soft ring.
+		 */
+		mutex_enter(&mac_srs->srs_lock);
+		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+		mutex_exit(&mac_srs->srs_lock);
+
+		mutex_enter(&ringp->s_ring_lock);
+	}
+	ringp->s_ring_state &= ~S_RING_PROC;
+	if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
+		cv_signal(&ringp->s_ring_client_cv);
+	ringp->s_ring_run = NULL;
+}
+
+/*
+ * mac_soft_ring_worker
+ *
+ * The soft ring worker routine to process any queued packets. In the
+ * normal case, the worker thread is bound to a CPU. If the soft ring
+ * is dealing with TCP packets, the worker thread is bound to the same
+ * CPU as the TCP squeue.
+ */
+static void
+mac_soft_ring_worker(mac_soft_ring_t *ringp)
+{
+	kmutex_t *lock = &ringp->s_ring_lock;
+	kcondvar_t *async = &ringp->s_ring_async;
+	mac_soft_ring_set_t *srs = ringp->s_ring_set;
+	callb_cpr_t cprinfo;
+
+	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_soft_ring");
+	mutex_enter(lock);
+start:
+	for (;;) {
+		while (((ringp->s_ring_first == NULL ||
+		    (ringp->s_ring_state & S_RING_BLOCK)) &&
+		    !(ringp->s_ring_state & S_RING_PAUSE)) ||
+		    (ringp->s_ring_state & S_RING_PROC)) {
+
+			CALLB_CPR_SAFE_BEGIN(&cprinfo);
+			cv_wait(async, lock);
+			CALLB_CPR_SAFE_END(&cprinfo, lock);
+		}
+
+		/*
+		 * Either we have work to do, or we have been asked to
+		 * shut down temporarily or permanently.
+		 */
+		if (ringp->s_ring_state & S_RING_PAUSE)
+			goto done;
+
+		ringp->s_ring_drain_func(ringp);
+	}
+done:
+	mutex_exit(lock);
+	mutex_enter(&srs->srs_lock);
+	mutex_enter(lock);
+
+	ringp->s_ring_state |= S_RING_QUIESCE_DONE;
+	if (!(ringp->s_ring_state & S_RING_CONDEMNED)) {
+		srs->srs_soft_ring_quiesced_count++;
+		cv_broadcast(&srs->srs_async);
+		mutex_exit(&srs->srs_lock);
+		while (!(ringp->s_ring_state &
+		    (S_RING_RESTART | S_RING_CONDEMNED)))
+			cv_wait(&ringp->s_ring_async, &ringp->s_ring_lock);
+		mutex_exit(lock);
+		mutex_enter(&srs->srs_lock);
+		mutex_enter(lock);
+		srs->srs_soft_ring_quiesced_count--;
+		if (ringp->s_ring_state & S_RING_RESTART) {
+			ASSERT(!(ringp->s_ring_state & S_RING_CONDEMNED));
+			ringp->s_ring_state &= ~(S_RING_RESTART |
+			    S_RING_QUIESCE | S_RING_QUIESCE_DONE);
+			cv_broadcast(&srs->srs_async);
+			mutex_exit(&srs->srs_lock);
+			goto start;
+		}
+	}
+	ASSERT(ringp->s_ring_state & S_RING_CONDEMNED);
+	ringp->s_ring_state |= S_RING_CONDEMNED_DONE;
+	CALLB_CPR_EXIT(&cprinfo);
+	srs->srs_soft_ring_condemned_count++;
+	cv_broadcast(&srs->srs_async);
+	mutex_exit(&srs->srs_lock);
+	thread_exit();
+}
+
+/*
+ * mac_soft_ring_intr_enable and mac_soft_ring_intr_disable
+ *
+ * These functions are called by the client to toggle the sending of
+ * packets to it. The client gets the names of these routines and the
+ * corresponding cookie (pointing to the soft ring) during capability
+ * negotiation at setup time.
+ *
+ * Enabling allows the processing thread to send packets to the
+ * client, while disabling does the opposite.
+ */
+void
+mac_soft_ring_intr_enable(void *arg)
+{
+	mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
+	mutex_enter(&ringp->s_ring_lock);
+	ringp->s_ring_state &= ~S_RING_BLANK;
+	if (ringp->s_ring_first != NULL)
+		mac_soft_ring_worker_wakeup(ringp);
+	mutex_exit(&ringp->s_ring_lock);
+}
+
+void
+mac_soft_ring_intr_disable(void *arg)
+{
+	mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
+	/*
+	 * Stop the worker thread from sending packets upstream.
+	 * The squeue will poll the soft ring when it needs packets.
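+	 * While S_RING_BLANK is set, packets accumulate on the ring and
+	 * the client is expected to pull them via mac_soft_ring_poll()
+	 * (below) rather than have the worker push them up.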
+	 */
+	mutex_enter(&ringp->s_ring_lock);
+	ringp->s_ring_state |= S_RING_BLANK;
+	mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * mac_soft_ring_poll
+ *
+ * This routine is called by the client to poll for packets from the
+ * soft ring. The function name and the cookie corresponding to the
+ * soft ring are exchanged during capability negotiation at setup time.
+ */
+mblk_t *
+mac_soft_ring_poll(mac_soft_ring_t *ringp, int bytes_to_pickup)
+{
+	mblk_t *head, *tail;
+	mblk_t *mp;
+	size_t sz = 0;
+	int cnt = 0;
+	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+	ASSERT(mac_srs != NULL);
+
+	mutex_enter(&ringp->s_ring_lock);
+	head = tail = mp = ringp->s_ring_first;
+	if (head == NULL) {
+		mutex_exit(&ringp->s_ring_lock);
+		return (NULL);
+	}
+
+	if (ringp->s_ring_size <= bytes_to_pickup) {
+		head = ringp->s_ring_first;
+		ringp->s_ring_first = NULL;
+		ringp->s_ring_last = NULL;
+		cnt = ringp->s_ring_count;
+		ringp->s_ring_count = 0;
+		sz = ringp->s_ring_size;
+		ringp->s_ring_size = 0;
+	} else {
+		while (mp && sz <= bytes_to_pickup) {
+			sz += msgdsize(mp);
+			cnt++;
+			tail = mp;
+			mp = mp->b_next;
+		}
+		ringp->s_ring_count -= cnt;
+		ringp->s_ring_size -= sz;
+		tail->b_next = NULL;
+		if (mp == NULL) {
+			ringp->s_ring_first = NULL;
+			ringp->s_ring_last = NULL;
+			ASSERT(ringp->s_ring_count == 0);
+		} else {
+			ringp->s_ring_first = mp;
+		}
+	}
+
+	mutex_exit(&ringp->s_ring_lock);
+	/*
+	 * Update the shared count and size counters so
+	 * that the SRS has an accurate idea of the queued packets.
+	 */
+	mutex_enter(&mac_srs->srs_lock);
+	MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+	MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+	mutex_exit(&mac_srs->srs_lock);
+	return (head);
+}
+
+/*
+ * mac_soft_ring_dls_bypass
+ *
+ * Enable the direct client (IP) callback function from the soft rings.
+ * Callers must ensure that they don't need any DLS layer processing.
+ */
+void
+mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
+{
+	mac_soft_ring_t *softring = arg;
+	mac_soft_ring_set_t *srs;
+
+	ASSERT(rx_func != NULL);
+
+	mutex_enter(&softring->s_ring_lock);
+	softring->s_ring_rx_func = rx_func;
+	softring->s_ring_rx_arg1 = rx_arg1;
+	mutex_exit(&softring->s_ring_lock);
+
+	srs = softring->s_ring_set;
+	mutex_enter(&srs->srs_lock);
+	srs->srs_type |= SRST_DLS_BYPASS;
+	mutex_exit(&srs->srs_lock);
+}
+
+/*
+ * mac_soft_ring_signal
+ *
+ * Typically used to set the soft ring state to QUIESCE, CONDEMNED, or
+ * RESTART.
+ *
+ * On the Rx side, the quiescing is done bottom up. Only after the Rx
+ * upcalls from the driver are done is the Rx SRS quiesced, and only
+ * then can we signal the soft rings. Thus this function can't be
+ * called arbitrarily without satisfying the prerequisites. On the Tx
+ * side, the threads from the top need to be quiesced first, then the
+ * Tx SRS, and only then can we signal the Tx soft rings.
+ */
+void
+mac_soft_ring_signal(mac_soft_ring_t *softring, uint_t sr_flag)
+{
+	mutex_enter(&softring->s_ring_lock);
+	softring->s_ring_state |= sr_flag;
+	cv_signal(&softring->s_ring_async);
+	mutex_exit(&softring->s_ring_lock);
+}
+
+/*
+ * mac_tx_soft_ring_drain
+ *
+ * The transmit-side drain routine, used when the soft ring is
+ * transmitting packets.
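+ *
+ * The flow-control handshake, in sketch form (see the code below):
+ *
+ *	mac_tx_send() returns leftover packets
+ *	    -> requeue them and set S_RING_BLOCK
+ *	driver wakes the ring back up (s_ring_tx_woken_up)
+ *	    -> S_RING_BLOCK is not set and draining continues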
+ */ +static void +mac_tx_soft_ring_drain(mac_soft_ring_t *ringp) +{ + mblk_t *mp; + void *arg1; + void *arg2; + mblk_t *tail; + uint_t saved_pkt_count, saved_size; + boolean_t is_subflow; + mac_tx_stats_t stats; + mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; + + saved_pkt_count = saved_size = 0; + ringp->s_ring_run = curthread; + ASSERT(mutex_owned(&ringp->s_ring_lock)); + ASSERT(!(ringp->s_ring_state & S_RING_PROC)); + + ringp->s_ring_state |= S_RING_PROC; + is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); + arg1 = ringp->s_ring_tx_arg1; + arg2 = ringp->s_ring_tx_arg2; + + while (ringp->s_ring_first != NULL) { + mp = ringp->s_ring_first; + tail = ringp->s_ring_last; + saved_pkt_count = ringp->s_ring_count; + saved_size = ringp->s_ring_size; + ringp->s_ring_first = NULL; + ringp->s_ring_last = NULL; + ringp->s_ring_count = 0; + ringp->s_ring_size = 0; + mutex_exit(&ringp->s_ring_lock); + + mp = mac_tx_send(arg1, arg2, mp, &stats); + + mutex_enter(&ringp->s_ring_lock); + if (mp != NULL) { + /* Device out of tx desc, set block */ + tail->b_next = ringp->s_ring_first; + ringp->s_ring_first = mp; + ringp->s_ring_count += + (saved_pkt_count - stats.ts_opackets); + ringp->s_ring_size += (saved_size - stats.ts_obytes); + if (ringp->s_ring_last == NULL) + ringp->s_ring_last = tail; + + if (ringp->s_ring_tx_woken_up) { + ringp->s_ring_tx_woken_up = B_FALSE; + } else { + ringp->s_ring_state |= S_RING_BLOCK; + ringp->s_ring_blocked_cnt++; + } + + ringp->s_ring_state &= ~S_RING_PROC; + ringp->s_ring_run = NULL; + return; + } else { + ringp->s_ring_tx_woken_up = B_FALSE; + if (is_subflow) { + FLOW_TX_STATS_UPDATE( + mac_srs->srs_flent, &stats); + } + } + } + + if (ringp->s_ring_count == 0 && ringp->s_ring_state & + (S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED)) { + mac_tx_notify_cb_t *mtnfp; + mac_cb_t *mcb; + mac_client_impl_t *mcip = ringp->s_ring_mcip; + boolean_t wakeup_required = B_FALSE; + + if (ringp->s_ring_state & + (S_RING_TX_HIWAT|S_RING_WAKEUP_CLIENT)) { + wakeup_required = B_TRUE; + } + ringp->s_ring_state &= + ~(S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED); + mutex_exit(&ringp->s_ring_lock); + if (wakeup_required) { + /* Wakeup callback registered clients */ + MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); + for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; + mcb = mcb->mcb_nextp) { + mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; + mtnfp->mtnf_fn(mtnfp->mtnf_arg, + (mac_tx_cookie_t)ringp); + } + MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, + &mcip->mci_tx_notify_cb_list); + /* + * If the client is not the primary MAC client, then we + * need to send the notification to the clients upper + * MAC, i.e. mci_upper_mip. + */ + mac_tx_notify(mcip->mci_upper_mip != NULL ? + mcip->mci_upper_mip : mcip->mci_mip); + } + mutex_enter(&ringp->s_ring_lock); + } + ringp->s_ring_state &= ~S_RING_PROC; + ringp->s_ring_run = NULL; +} diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c new file mode 100644 index 0000000000..1615060736 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -0,0 +1,823 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * MAC Services Module - misc utilities
+ */
+
+#include <sys/types.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <sys/pattr.h>
+#include <sys/pci_tools.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <sys/vtrace.h>
+#include <sys/dlpi.h>
+#include <sys/sunndi.h>
+
+/*
+ * Copy an mblk, preserving its hardware checksum flags.
+ */
+static mblk_t *
+mac_copymsg_cksum(mblk_t *mp)
+{
+	mblk_t *mp1;
+	uint32_t start, stuff, end, value, flags;
+
+	mp1 = copymsg(mp);
+	if (mp1 == NULL)
+		return (NULL);
+
+	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
+	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
+	    flags, KM_NOSLEEP);
+
+	return (mp1);
+}
+
+/*
+ * Copy an mblk chain, preserving the hardware checksum flags of the
+ * individual mblks.
+ */
+mblk_t *
+mac_copymsgchain_cksum(mblk_t *mp)
+{
+	mblk_t *nmp = NULL;
+	mblk_t **nmpp = &nmp;
+
+	for (; mp != NULL; mp = mp->b_next) {
+		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
+			freemsgchain(nmp);
+			return (NULL);
+		}
+
+		nmpp = &((*nmpp)->b_next);
+	}
+
+	return (nmp);
+}
+
+/*
+ * Process the specified mblk chain for proper handling of hardware
+ * checksum offload. This routine is invoked for loopback traffic
+ * between MAC clients.
+ * The function handles a NULL mblk chain passed as an argument.
+ */
+mblk_t *
+mac_fix_cksum(mblk_t *mp_chain)
+{
+	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
+	uint32_t flags, start, stuff, end, value;
+
+	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
+		uint16_t len;
+		uint32_t offset;
+		struct ether_header *ehp;
+		uint16_t sap;
+
+		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
+		    &flags);
+		if (flags == 0)
+			continue;
+
+		/*
+		 * Since the processing of checksum offload for loopback
+		 * traffic requires modification of the packet contents,
+		 * ensure that we are always modifying our own copy.
+		 */
+		if (DB_REF(mp) > 1) {
+			mp1 = copymsg(mp);
+			if (mp1 == NULL)
+				continue;
+			mp1->b_next = mp->b_next;
+			mp->b_next = NULL;
+			freemsg(mp);
+			if (prev != NULL)
+				prev->b_next = mp1;
+			else
+				new_chain = mp1;
+			mp = mp1;
+		}
+
+		/*
+		 * Ethernet, and optionally VLAN header.
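+		 * The two possible layouts:
+		 *
+		 *	untagged: dst[6] src[6] type[2]		  (offset 14)
+		 *	tagged:	  dst[6] src[6] tpid[2] tci[2] type[2]
+		 *		  (offset 18)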
+ */ + /* LINTED: improper alignment cast */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); + /* LINTED: improper alignment cast */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + sap = ntohs(evhp->ether_type); + offset = sizeof (struct ether_vlan_header); + } else { + sap = ntohs(ehp->ether_type); + offset = sizeof (struct ether_header); + } + + if (MBLKL(mp) <= offset) { + offset -= MBLKL(mp); + if (mp->b_cont == NULL) { + /* corrupted packet, skip it */ + if (prev != NULL) + prev->b_next = mp->b_next; + else + new_chain = mp->b_next; + mp1 = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + mp = mp1; + continue; + } + mp = mp->b_cont; + } + + if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { + ipha_t *ipha = NULL; + + /* + * In order to compute the full and header + * checksums, we need to find and parse + * the IP and/or ULP headers. + */ + + sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + + /* + * IP header. + */ + if (sap != ETHERTYPE_IP) + continue; + + ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); + /* LINTED: improper alignment cast */ + ipha = (ipha_t *)(mp->b_rptr + offset); + + if (flags & HCK_FULLCKSUM) { + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + uint8_t proto; + + /* + * Pointer to checksum field in ULP header. + */ + proto = ipha->ipha_protocol; + ASSERT(ipha->ipha_version_and_hdr_length == + IP_SIMPLE_HDR_VERSION); + if (proto == IPPROTO_TCP) { + /* LINTED: improper alignment cast */ + up = IPH_TCPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + } else { + ASSERT(proto == IPPROTO_UDP); + /* LINTED: improper alignment cast */ + up = IPH_UDPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + } + + /* + * Pseudo-header checksum. + */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - + IP_SIMPLE_HDR_LENGTH; + + cksum = (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * The checksum value stored in the packet needs + * to be correct. Compute it here. + */ + *up = 0; + cksum += (((proto) == IPPROTO_UDP) ? + IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); + cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + + offset, cksum); + *(up) = (uint16_t)(cksum ? cksum : ~cksum); + + flags |= HCK_FULLCKSUM_OK; + value = 0xffff; + } + + if (flags & HCK_IPV4_HDRCKSUM) { + ASSERT(ipha != NULL); + ipha->ipha_hdr_checksum = + (uint16_t)ip_csum_hdr(ipha); + } + } + + if (flags & HCK_PARTIALCKSUM) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + if (mp->b_cont != NULL) { + mblk_t *mp1; + + mp1 = msgpullup(mp, offset + end); + if (mp1 == NULL) + continue; + mp1->b_next = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + if (prev != NULL) + prev->b_next = mp1; + else + new_chain = mp1; + mp = mp1; + } + + ipp = mp->b_rptr + offset; + /* LINTED: cast may result in improper alignment */ + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, + end - start, partial); + cksum = ~cksum; + *up = cksum ? cksum : ~cksum; + + /* + * Since we already computed the whole checksum, + * indicate to the stack that it has already + * been verified by the hardware. 
+ */ + flags &= ~HCK_PARTIALCKSUM; + flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); + value = 0xffff; + } + + (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, + value, flags, KM_NOSLEEP); + } + + return (new_chain); +} + +/* + * Add VLAN tag to the specified mblk. + */ +mblk_t * +mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) +{ + mblk_t *hmp; + struct ether_vlan_header *evhp; + struct ether_header *ehp; + uint32_t start, stuff, end, value, flags; + + ASSERT(pri != 0 || vid != 0); + + /* + * Allocate an mblk for the new tagged ethernet header, + * and copy the MAC addresses and ethertype from the + * original header. + */ + + hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); + if (hmp == NULL) { + freemsg(mp); + return (NULL); + } + + evhp = (struct ether_vlan_header *)hmp->b_rptr; + ehp = (struct ether_header *)mp->b_rptr; + + bcopy(ehp, evhp, (ETHERADDRL * 2)); + evhp->ether_type = ehp->ether_type; + evhp->ether_tpid = htons(ETHERTYPE_VLAN); + + hmp->b_wptr += sizeof (struct ether_vlan_header); + mp->b_rptr += sizeof (struct ether_header); + + /* + * Free the original message if it's now empty. Link the + * rest of messages to the header message. + */ + hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); + (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, + KM_NOSLEEP); + if (MBLKL(mp) == 0) { + hmp->b_cont = mp->b_cont; + freeb(mp); + } else { + hmp->b_cont = mp; + } + ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); + + /* + * Initialize the new TCI (Tag Control Information). + */ + evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); + + return (hmp); +} + +/* + * Adds a VLAN tag with the specified VID and priority to each mblk of + * the specified chain. + */ +mblk_t * +mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) +{ + mblk_t *next_mp, **prev, *mp; + + mp = mp_chain; + prev = &mp_chain; + + while (mp != NULL) { + next_mp = mp->b_next; + mp->b_next = NULL; + if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { + freemsgchain(next_mp); + break; + } + *prev = mp; + prev = &mp->b_next; + mp = mp->b_next = next_mp; + } + + return (mp_chain); +} + +/* + * Strip VLAN tag + */ +mblk_t * +mac_strip_vlan_tag(mblk_t *mp) +{ + mblk_t *newmp; + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { + ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); + + if (DB_REF(mp) > 1) { + newmp = copymsg(mp); + if (newmp == NULL) + return (NULL); + freemsg(mp); + mp = newmp; + } + + evhp = (struct ether_vlan_header *)mp->b_rptr; + + ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); + mp->b_rptr += VLAN_TAGSZ; + } + return (mp); +} + +/* + * Strip VLAN tag from each mblk of the chain. + */ +mblk_t * +mac_strip_vlan_tag_chain(mblk_t *mp_chain) +{ + mblk_t *mp, *next_mp, **prev; + + mp = mp_chain; + prev = &mp_chain; + + while (mp != NULL) { + next_mp = mp->b_next; + mp->b_next = NULL; + if ((mp = mac_strip_vlan_tag(mp)) == NULL) { + freemsgchain(next_mp); + break; + } + *prev = mp; + prev = &mp->b_next; + mp = mp->b_next = next_mp; + } + + return (mp_chain); +} + +/* + * Default callback function. Used when the datapath is not yet initialized. 
+ */
+/* ARGSUSED */
+void
+mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
+    boolean_t loopback)
+{
+	mblk_t *mp1 = mp;
+
+	while (mp1 != NULL) {
+		mp1->b_prev = NULL;
+		mp1->b_queue = NULL;
+		mp1 = mp1->b_next;
+	}
+	freemsgchain(mp);
+}
+
+/*
+ * Determines the IPv6 header length accounting for all the optional IPv6
+ * headers (hop-by-hop, destination, routing and fragment). The header length
+ * and next header value (a transport header) are captured.
+ *
+ * Returns B_FALSE if the IP headers are not all in the same mblk; otherwise
+ * returns B_TRUE.
+ */
+boolean_t
+mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length,
+    uint8_t *next_hdr)
+{
+	uint16_t length;
+	uint_t	ehdrlen;
+	uint8_t	*whereptr;
+	uint8_t	*endptr;
+	uint8_t	*nexthdrp;
+	ip6_dest_t *desthdr;
+	ip6_rthdr_t *rthdr;
+	ip6_frag_t *fraghdr;
+
+	endptr = mp->b_wptr;
+	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
+		return (B_FALSE);
+	ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION);
+	length = IPV6_HDR_LEN;
+	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
+
+	nexthdrp = &ip6h->ip6_nxt;
+	while (whereptr < endptr) {
+		/* Is there enough left for len + nexthdr? */
+		if (whereptr + MIN_EHDR_LEN > endptr)
+			break;
+
+		switch (*nexthdrp) {
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_DSTOPTS:
+			/* Assumes the headers are identical for hbh and dst */
+			desthdr = (ip6_dest_t *)whereptr;
+			ehdrlen = 8 * (desthdr->ip6d_len + 1);
+			if ((uchar_t *)desthdr + ehdrlen > endptr)
+				return (B_FALSE);
+			nexthdrp = &desthdr->ip6d_nxt;
+			break;
+		case IPPROTO_ROUTING:
+			rthdr = (ip6_rthdr_t *)whereptr;
+			ehdrlen = 8 * (rthdr->ip6r_len + 1);
+			if ((uchar_t *)rthdr + ehdrlen > endptr)
+				return (B_FALSE);
+			nexthdrp = &rthdr->ip6r_nxt;
+			break;
+		case IPPROTO_FRAGMENT:
+			fraghdr = (ip6_frag_t *)whereptr;
+			ehdrlen = sizeof (ip6_frag_t);
+			if ((uchar_t *)&fraghdr[1] > endptr)
+				return (B_FALSE);
+			nexthdrp = &fraghdr->ip6f_nxt;
+			break;
+		case IPPROTO_NONE:
+			/* No next header means we're finished */
+		default:
+			*hdr_length = length;
+			*next_hdr = *nexthdrp;
+			return (B_TRUE);
+		}
+		length += ehdrlen;
+		whereptr += ehdrlen;
+		*hdr_length = length;
+		*next_hdr = *nexthdrp;
+	}
+	switch (*nexthdrp) {
+	case IPPROTO_HOPOPTS:
+	case IPPROTO_DSTOPTS:
+	case IPPROTO_ROUTING:
+	case IPPROTO_FRAGMENT:
+		/*
+		 * If any known extension headers are still to be processed,
+		 * the packet is malformed (or at least the IP headers are
+		 * not all in the same mblk, and that should never happen).
+		 */
+		return (B_FALSE);
+
+	default:
+		/*
+		 * If we get here, we know that all of the IP headers were in
+		 * the same mblk, even if the ULP header is in the next mblk.
+		 */
+		*hdr_length = length;
+		*next_hdr = *nexthdrp;
+		return (B_TRUE);
+	}
+}
+
+typedef struct mac_dladm_intr {
+	int	ino;
+	int	cpu_id;
+	char	driver_path[MAXPATHLEN];
+	char	nexus_path[MAXPATHLEN];
+} mac_dladm_intr_t;
+
+/* Bind the interrupt to cpu_num */
+static int
+mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int ino)
+{
+	pcitool_intr_set_t iset;
+	int err;
+
+	iset.ino = ino;
+	iset.cpu_id = cpu_num;
+	iset.user_version = PCITOOL_VERSION;
+	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
+	    kcred, NULL);
+
+	return (err);
+}
+
+/*
+ * Search interrupt information.
iget is filled in with the info to search + */ +static boolean_t +mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) +{ + int i; + char driver_path[2 * MAXPATHLEN]; + + for (i = 0; i < iget_p->num_devs; i++) { + (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); + (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, + ":%s%d", iget_p->dev[i].driver_name, + iget_p->dev[i].dev_inst); + /* Match the device path for the device path */ + if (strcmp(driver_path, dln->driver_path) == 0) { + dln->ino = iget_p->ino; + dln->cpu_id = iget_p->cpu_id; + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * Get information about ino, i.e. if this is the interrupt for our + * device and where it is bound etc. + */ +static boolean_t +mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln) +{ + pcitool_intr_get_t *iget_p; + int ipsz; + int nipsz; + int err; + uint8_t inum; + + /* + * Check if SLEEP is OK, i.e if could come here in response to + * changing the fanout due to some callback from the driver, say + * link speed changes. + */ + ipsz = PCITOOL_IGET_SIZE(0); + iget_p = kmem_zalloc(ipsz, KM_SLEEP); + + iget_p->num_devs_ret = 0; + iget_p->user_version = PCITOOL_VERSION; + iget_p->ino = ino; + + err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, + FKIOCTL, kcred, NULL); + if (err != 0) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + if (iget_p->num_devs == 0) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + inum = iget_p->num_devs; + if (iget_p->num_devs_ret < iget_p->num_devs) { + /* Reallocate */ + nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); + + kmem_free(iget_p, ipsz); + ipsz = nipsz; + iget_p = kmem_zalloc(ipsz, KM_SLEEP); + + iget_p->num_devs_ret = inum; + iget_p->ino = ino; + iget_p->user_version = PCITOOL_VERSION; + err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, + FKIOCTL, kcred, NULL); + if (err != 0) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + /* defensive */ + if (iget_p->num_devs != iget_p->num_devs_ret) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + } + + if (mac_search_intrinfo(iget_p, dln)) { + kmem_free(iget_p, ipsz); + return (B_TRUE); + } + kmem_free(iget_p, ipsz); + return (B_FALSE); +} + +/* + * Get the interrupts and check each one to see if it is for our device. + */ +static int +mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) +{ + pcitool_intr_info_t intr_info; + int err; + int ino; + + err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, + FKIOCTL, kcred, NULL); + if (err != 0) + return (-1); + + for (ino = 0; ino < intr_info.num_intr; ino++) { + if (mac_get_single_intr(lh, ino, dln)) { + if (dln->cpu_id == cpuid) + return (0); + return (1); + } + } + return (-1); +} + +/* + * Obtain the nexus parent node info. for mdip. 
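+ * The routine walks up the devinfo tree from mdip looking for a minor
+ * node of type DDI_NT_INTRCTL, and records the nexus ":intr" device
+ * path, e.g. (hypothetically) "/devices/pci@7c0:intr".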
+ */ +static dev_info_t * +mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) +{ + struct dev_info *tdip = (struct dev_info *)mdip; + struct ddi_minor_data *minordata; + int circ; + dev_info_t *pdip; + char pathname[MAXPATHLEN]; + + while (tdip != NULL) { + ndi_devi_enter((dev_info_t *)tdip, &circ); + for (minordata = tdip->devi_minor; minordata != NULL; + minordata = minordata->next) { + if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, + strlen(DDI_NT_INTRCTL)) == 0) { + pdip = minordata->dip; + (void) ddi_pathname(pdip, pathname); + (void) snprintf(dln->nexus_path, MAXPATHLEN, + "/devices%s:intr", pathname); + (void) ddi_pathname_minor(minordata, pathname); + ndi_devi_exit((dev_info_t *)tdip, circ); + return (pdip); + } + } + ndi_devi_exit((dev_info_t *)tdip, circ); + tdip = tdip->devi_parent; + } + return (NULL); +} + +/* + * For a primary MAC client, if the user has set a list or CPUs or + * we have obtained it implicitly, we try to retarget the interrupt + * for that device on one of the CPUs in the list. + * We assign the interrupt to the same CPU as the poll thread. + */ +static boolean_t +mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) +{ + ldi_handle_t lh = NULL; + ldi_ident_t li = NULL; + int err; + int ret; + mac_dladm_intr_t dln; + dev_info_t *dip; + struct ddi_minor_data *minordata; + + dln.nexus_path[0] = '\0'; + dln.driver_path[0] = '\0'; + + minordata = ((struct dev_info *)mdip)->devi_minor; + while (minordata != NULL) { + if (minordata->type == DDM_MINOR) + break; + minordata = minordata->next; + } + if (minordata == NULL) + return (B_FALSE); + + (void) ddi_pathname_minor(minordata, dln.driver_path); + + dip = mac_get_nexus_node(mdip, &dln); + /* defensive */ + if (dip == NULL) + return (B_FALSE); + + err = ldi_ident_from_major(ddi_driver_major(dip), &li); + if (err != 0) + return (B_FALSE); + + err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); + if (err != 0) + return (B_FALSE); + + ret = mac_validate_intr(lh, &dln, cpuid); + if (ret < 0) { + (void) ldi_close(lh, FREAD|FWRITE, kcred); + return (B_FALSE); + } + /* cmn_note? */ + if (ret != 0) + if ((err = (mac_set_intr(lh, cpuid, dln.ino))) != 0) { + (void) ldi_close(lh, FREAD|FWRITE, kcred); + return (B_FALSE); + } + (void) ldi_close(lh, FREAD|FWRITE, kcred); + return (B_TRUE); +} + +void +mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) +{ + dev_info_t *mdip = (dev_info_t *)arg; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_resource_props_t *mrp; + mac_perim_handle_t mph; + + if (cpuid == -1 || !mac_check_interrupt_binding(mdip, cpuid)) + return; + + mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); + mrp = MCIP_RESOURCE_PROPS(mcip); + mrp->mrp_intr_cpu = cpuid; + mac_perim_exit(mph); +} + +int32_t +mac_client_intr_cpu(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_cpus_t *srs_cpu; + mac_soft_ring_set_t *rx_srs; + flow_entry_t *flent = mcip->mci_flent; + mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + + /* + * Check if we need to retarget the interrupt. We do this only + * for the primary MAC client. We do this if we have the only + * exclusive ring in the group. 
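+ * When fe_rx_srs_cnt == 2, fe_rx_srs[1] is the SRS for the exclusive
+ * ring; the interrupt is retargeted to the CPU of that SRS's poll
+ * thread (mc_pollid), unless it is already bound there.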
+ */ + if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { + rx_srs = flent->fe_rx_srs[1]; + srs_cpu = &rx_srs->srs_cpu; + if (mrp->mrp_intr_cpu == srs_cpu->mc_pollid) + return (-1); + return (srs_cpu->mc_pollid); + } + return (-1); +} + +void * +mac_get_devinfo(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return ((void *)mip->mi_dip); +} diff --git a/usr/src/uts/common/io/mac/plugins/mac_ether.c b/usr/src/uts/common/io/mac/plugins/mac_ether.c index f4cf08eb66..abaab66add 100644 --- a/usr/src/uts/common/io/mac/plugins/mac_ether.c +++ b/usr/src/uts/common/io/mac/plugins/mac_ether.c @@ -30,9 +30,8 @@ #include <sys/types.h> #include <sys/modctl.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/dld_impl.h> #include <sys/mac_ether.h> -#include <sys/dls.h> #include <sys/ethernet.h> #include <sys/byteorder.h> #include <sys/strsun.h> diff --git a/usr/src/uts/common/io/mac/plugins/mac_wifi.c b/usr/src/uts/common/io/mac/plugins/mac_wifi.c index 668d7dbda1..fb45c8ef1c 100644 --- a/usr/src/uts/common/io/mac/plugins/mac_wifi.c +++ b/usr/src/uts/common/io/mac/plugins/mac_wifi.c @@ -32,9 +32,8 @@ #include <sys/types.h> #include <sys/modctl.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/dld_impl.h> #include <sys/mac_wifi.h> -#include <sys/dls.h> #include <sys/ethernet.h> #include <sys/byteorder.h> #include <sys/strsun.h> diff --git a/usr/src/uts/common/io/mxfe/mxfe.c b/usr/src/uts/common/io/mxfe/mxfe.c index 9470ac6b6b..044274acbf 100644 --- a/usr/src/uts/common/io/mxfe/mxfe.c +++ b/usr/src/uts/common/io/mxfe/mxfe.c @@ -177,7 +177,6 @@ static mac_callbacks_t mxfe_m_callbacks = { mxfe_m_multicst, mxfe_m_unicst, mxfe_m_tx, - NULL, /* mc_resources */ NULL, /* mc_ioctl */ NULL, /* mc_getcapab */ NULL, /* mc_open */ diff --git a/usr/src/uts/common/io/mxfe/mxfeimpl.h b/usr/src/uts/common/io/mxfe/mxfeimpl.h index c1bc8ab265..d5742eeceb 100644 --- a/usr/src/uts/common/io/mxfe/mxfeimpl.h +++ b/usr/src/uts/common/io/mxfe/mxfeimpl.h @@ -36,14 +36,14 @@ #ifndef _MXFEIMPL_H #define _MXFEIMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This entire file is private to the MXFE driver. */ #ifdef _KERNEL +#include <sys/mac_provider.h> + /* * Compile time tunables. */ diff --git a/usr/src/uts/common/io/net80211/net80211.c b/usr/src/uts/common/io/net80211/net80211.c index 4b74943c85..fd49066fcc 100644 --- a/usr/src/uts/common/io/net80211/net80211.c +++ b/usr/src/uts/common/io/net80211/net80211.c @@ -35,8 +35,6 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * IEEE 802.11 generic handler */ @@ -47,6 +45,7 @@ #include <sys/modctl.h> #include <sys/stropts.h> #include <sys/door.h> +#include <sys/mac_provider.h> #include "net80211_impl.h" uint32_t ieee80211_debug = 0x0; /* debug msg flags */ diff --git a/usr/src/uts/common/io/net80211/net80211_input.c b/usr/src/uts/common/io/net80211/net80211_input.c index ca948788d0..eb95149ea6 100644 --- a/usr/src/uts/common/io/net80211/net80211_input.c +++ b/usr/src/uts/common/io/net80211/net80211_input.c @@ -39,6 +39,7 @@ * Process received frame */ +#include <sys/mac_provider.h> #include <sys/byteorder.h> #include <sys/strsun.h> #include "net80211_impl.h" diff --git a/usr/src/uts/common/io/net80211/net80211_ioctl.c b/usr/src/uts/common/io/net80211/net80211_ioctl.c index 8e905971ff..44935e0979 100644 --- a/usr/src/uts/common/io/net80211/net80211_ioctl.c +++ b/usr/src/uts/common/io/net80211/net80211_ioctl.c @@ -41,7 +41,7 @@ #include <inet/nd.h> #include <inet/mi.h> #include <sys/note.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <inet/wifi_ioctl.h> #include "net80211_impl.h" diff --git a/usr/src/uts/common/io/nge/nge.h b/usr/src/uts/common/io/nge/nge.h index 430df8b83b..2944c6b820 100644 --- a/usr/src/uts/common/io/nge/nge.h +++ b/usr/src/uts/common/io/nge/nge.h @@ -61,7 +61,7 @@ extern "C" { #include <sys/ddi.h> #include <sys/sunddi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* diff --git a/usr/src/uts/common/io/nge/nge_main.c b/usr/src/uts/common/io/nge/nge_main.c index 7ea4165779..f7b22f86e6 100644 --- a/usr/src/uts/common/io/nge/nge_main.c +++ b/usr/src/uts/common/io/nge/nge_main.c @@ -196,7 +196,6 @@ static mac_callbacks_t nge_m_callbacks = { nge_m_multicst, nge_m_unicst, nge_m_tx, - NULL, nge_m_ioctl, nge_m_getcapab, NULL, @@ -2137,12 +2136,6 @@ nge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE, stating that we support polling is sufficient. 
- */ - break; default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/ntxn/unm_nic.h b/usr/src/uts/common/io/ntxn/unm_nic.h index 6c8232757f..e23c385ce5 100644 --- a/usr/src/uts/common/io/ntxn/unm_nic.h +++ b/usr/src/uts/common/io/ntxn/unm_nic.h @@ -54,7 +54,7 @@ #include <inet/mi.h> #include <inet/nd.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/miiregs.h> /* by fjlite out of intel */ diff --git a/usr/src/uts/common/io/ntxn/unm_nic_main.c b/usr/src/uts/common/io/ntxn/unm_nic_main.c index b7e0c5832d..3db781fc8f 100644 --- a/usr/src/uts/common/io/ntxn/unm_nic_main.c +++ b/usr/src/uts/common/io/ntxn/unm_nic_main.c @@ -2513,9 +2513,6 @@ ntxn_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM); } break; - - case MAC_CAPAB_POLL: - case MAC_CAPAB_MULTIADDRESS: default: return (B_FALSE); } @@ -2534,7 +2531,6 @@ static mac_callbacks_t ntxn_m_callbacks = { ntxn_m_multicst, ntxn_m_unicst, ntxn_m_tx, - NULL, /* mc_resources */ ntxn_m_ioctl, ntxn_m_getcapab, NULL, /* mc_open */ diff --git a/usr/src/uts/common/io/nxge/nxge_fzc.c b/usr/src/uts/common/io/nxge/nxge_fzc.c index 91b5712895..3831d77eed 100644 --- a/usr/src/uts/common/io/nxge/nxge_fzc.c +++ b/usr/src/uts/common/io/nxge/nxge_fzc.c @@ -942,15 +942,18 @@ nxge_fzc_rdc_tbl_unbind(p_nxge_t nxge, int rdc_tbl) NXGE_DEBUG_MSG((nxge, DMA_CTL, "==> nxge_fzc_rdc_tbl_unbind(%d)", rdc_tbl)); + MUTEX_ENTER(&nhd->lock); table = &nhd->rdc_tbl[rdc_tbl]; if (table->nxge != (uintptr_t)nxge) { NXGE_ERROR_MSG((nxge, DMA_CTL, "nxge_fzc_rdc_tbl_unbind(%d): func%d not owner", nxge->function_num, rdc_tbl)); + MUTEX_EXIT(&nhd->lock); return (EINVAL); } else { bzero(table, sizeof (*table)); } + MUTEX_EXIT(&nhd->lock); NXGE_DEBUG_MSG((nxge, DMA_CTL, "<== nxge_fzc_rdc_tbl_unbind(%d)", rdc_tbl)); diff --git a/usr/src/uts/common/io/nxge/nxge_hcall.s b/usr/src/uts/common/io/nxge/nxge_hcall.s index c9f82b52df..56c85945b5 100644 --- a/usr/src/uts/common/io/nxge/nxge_hcall.s +++ b/usr/src/uts/common/io/nxge/nxge_hcall.s @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Hypervisor calls called by niu leaf driver. 
*/ @@ -34,6 +32,8 @@ #include <sys/hypervisor_api.h> #include <sys/nxge/nxge_impl.h> +#if defined(sun4v) + /* * NIU HV API v1.0 definitions */ @@ -518,3 +518,5 @@ hv_niu_vrrx_set_ino(uint32_t cookie, uint64_t vridx, uint32_t ino) SET_SIZE(hv_niu_vrtx_param_set) #endif /* lint || __lint */ + +#endif /*defined(sun4v)*/ diff --git a/usr/src/uts/common/io/nxge/nxge_hio.c b/usr/src/uts/common/io/nxge/nxge_hio.c index f4aa20706d..2b9a972fec 100644 --- a/usr/src/uts/common/io/nxge/nxge_hio.c +++ b/usr/src/uts/common/io/nxge/nxge_hio.c @@ -34,6 +34,7 @@ * */ +#include <sys/mac_provider.h> #include <sys/nxge/nxge_impl.h> #include <sys/nxge/nxge_fzc.h> #include <sys/nxge/nxge_rxdma.h> @@ -49,7 +50,9 @@ extern npi_status_t npi_rxdma_dump_rdc_table(npi_handle_t, uint8_t); /* The following function may be found in nxge_main.c */ -extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); +extern int nxge_m_mmac_remove(void *arg, int slot); +extern int nxge_m_mmac_add_g(void *arg, const uint8_t *maddr, int rdctbl, + boolean_t usetbl); /* The following function may be found in nxge_[t|r]xdma.c */ extern npi_status_t nxge_txdma_channel_disable(nxge_t *, int); @@ -129,6 +132,7 @@ int nxge_hio_init(nxge_t *nxge) { nxge_hio_data_t *nhd; + int i; nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; if (nhd == 0) { @@ -137,6 +141,31 @@ nxge_hio_init(nxge_t *nxge) nxge->nxge_hw_p->hio = (uintptr_t)nhd; } + /* + * Initialize share and ring group structures. + */ + for (i = 0; i < NXGE_MAX_TDCS; i++) + nxge->tdc_is_shared[i] = B_FALSE; + + for (i = 0; i < NXGE_MAX_TDC_GROUPS; i++) { + nxge->tx_hio_groups[i].ghandle = NULL; + nxge->tx_hio_groups[i].nxgep = nxge; + nxge->tx_hio_groups[i].type = MAC_RING_TYPE_TX; + nxge->tx_hio_groups[i].gindex = 0; + nxge->tx_hio_groups[i].sindex = 0; + } + + for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { + nxge->rx_hio_groups[i].ghandle = NULL; + nxge->rx_hio_groups[i].nxgep = nxge; + nxge->rx_hio_groups[i].type = MAC_RING_TYPE_RX; + nxge->rx_hio_groups[i].gindex = 0; + nxge->rx_hio_groups[i].sindex = 0; + nxge->rx_hio_groups[i].started = B_FALSE; + nxge->rx_hio_groups[i].rdctbl = -1; + nxge->rx_hio_groups[i].n_mac_addrs = 0; + } + nhd->hio.ldoms = B_FALSE; return (NXGE_OK); @@ -400,7 +429,7 @@ nxge_grp_dc_add( NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_grp_dc_add")); - if (group == NULL) + if (group == 0) return (0); switch (type) { @@ -424,7 +453,6 @@ nxge_grp_dc_add( default: NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_grp_dc_add: unknown type channel(%d)", channel)); - return (NXGE_ERROR); } NXGE_DEBUG_MSG((nxge, HIO_CTL, @@ -540,9 +568,6 @@ nxge_grp_dc_remove( MUTEX_ENTER(&nhd->lock); set = dc->type == VP_BOUND_TX ? &nxge->tx_set : &nxge->rx_set; - if (isLDOMs(nxge) && ((1 << channel) && set->shared.map)) { - NXGE_DC_RESET(group->map, channel); - } /* Remove the DC from its group. 
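 * The channel bit in the group map is now cleared inside
 * nxge_grp_dc_unlink() itself, so an explicit NXGE_DC_RESET() is no
 * longer needed here.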
*/ if (nxge_grp_dc_unlink(nxge, group, channel) != dc) { @@ -663,7 +688,10 @@ nxge_grp_dc_append( * Any domain */ nxge_hio_dc_t * -nxge_grp_dc_unlink(nxge_t *nxge, nxge_grp_t *group, int channel) +nxge_grp_dc_unlink( + nxge_t *nxge, + nxge_grp_t *group, + int channel) { nxge_hio_dc_t *current, *previous; @@ -699,6 +727,7 @@ nxge_grp_dc_unlink(nxge_t *nxge, nxge_grp_t *group, int channel) current->next = 0; current->group = 0; + NXGE_DC_RESET(group->map, channel); group->count--; } @@ -914,15 +943,14 @@ nxge_ddi_perror( * Local prototypes */ static nxge_hio_vr_t *nxge_hio_vr_share(nxge_t *); - -static int nxge_hio_dc_share(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t); static void nxge_hio_unshare(nxge_hio_vr_t *); -static int nxge_hio_addres(nxge_hio_vr_t *, mac_ring_type_t, int); +static int nxge_hio_addres(nxge_hio_vr_t *, mac_ring_type_t, uint64_t *); static void nxge_hio_remres(nxge_hio_vr_t *, mac_ring_type_t, res_map_t); -static void nxge_hio_tdc_unshare(nxge_t *nxge, int channel); -static void nxge_hio_rdc_unshare(nxge_t *nxge, int channel); +static void nxge_hio_tdc_unshare(nxge_t *nxge, int dev_grpid, int channel); +static void nxge_hio_rdc_unshare(nxge_t *nxge, int dev_grpid, int channel); +static int nxge_hio_dc_share(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t, int); static void nxge_hio_dc_unshare(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t, int); @@ -967,6 +995,28 @@ nxge_hio_init( } } + /* + * Initialize share and ring group structures. + */ + for (i = 0; i < NXGE_MAX_TDC_GROUPS; i++) { + nxge->tx_hio_groups[i].ghandle = NULL; + nxge->tx_hio_groups[i].nxgep = nxge; + nxge->tx_hio_groups[i].type = MAC_RING_TYPE_TX; + nxge->tx_hio_groups[i].gindex = 0; + nxge->tx_hio_groups[i].sindex = 0; + } + + for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { + nxge->rx_hio_groups[i].ghandle = NULL; + nxge->rx_hio_groups[i].nxgep = nxge; + nxge->rx_hio_groups[i].type = MAC_RING_TYPE_RX; + nxge->rx_hio_groups[i].gindex = 0; + nxge->rx_hio_groups[i].sindex = 0; + nxge->rx_hio_groups[i].started = B_FALSE; + nxge->rx_hio_groups[i].rdctbl = -1; + nxge->rx_hio_groups[i].n_mac_addrs = 0; + } + if (!isLDOMs(nxge)) { nhd->hio.ldoms = B_FALSE; return (NXGE_OK); @@ -983,22 +1033,15 @@ nxge_hio_init( nhd->vrs = NXGE_VR_SR_MAX - 2; /* - * Initialize tdc share state, shares and ring group structures. + * Initialize the share stuctures. */ for (i = 0; i < NXGE_MAX_TDCS; i++) nxge->tdc_is_shared[i] = B_FALSE; - for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { - nxge->rx_hio_groups[i].ghandle = NULL; - nxge->rx_hio_groups[i].nxgep = nxge; - nxge->rx_hio_groups[i].gindex = 0; - nxge->rx_hio_groups[i].sindex = 0; - } - for (i = 0; i < NXGE_VR_SR_MAX; i++) { nxge->shares[i].nxgep = nxge; nxge->shares[i].index = 0; - nxge->shares[i].vrp = (void *)NULL; + nxge->shares[i].vrp = NULL; nxge->shares[i].tmap = 0; nxge->shares[i].rmap = 0; nxge->shares[i].rxgroup = 0; @@ -1033,77 +1076,251 @@ nxge_hio_init( return (0); } +#endif /* defined(sun4v) */ + +static int +nxge_hio_group_mac_add(nxge_t *nxge, nxge_ring_group_t *g, + const uint8_t *macaddr) +{ + int rv; + nxge_rdc_grp_t *group; + + mutex_enter(nxge->genlock); + + /* + * Initialize the NXGE RDC table data structure. + */ + group = &nxge->pt_config.rdc_grps[g->rdctbl]; + if (!group->flag) { + group->port = NXGE_GET_PORT_NUM(nxge->function_num); + group->config_method = RDC_TABLE_ENTRY_METHOD_REP; + group->flag = B_TRUE; /* This group has been configured. */ + } + + mutex_exit(nxge->genlock); + + /* + * Add the MAC address. 
+ */ + if ((rv = nxge_m_mmac_add_g((void *)nxge, macaddr, + g->rdctbl, B_TRUE)) != 0) { + return (rv); + } + + mutex_enter(nxge->genlock); + g->n_mac_addrs++; + mutex_exit(nxge->genlock); + return (0); +} static int nxge_hio_add_mac(void *arg, const uint8_t *mac_addr) { - nxge_rx_ring_group_t *rxgroup = (nxge_rx_ring_group_t *)arg; - p_nxge_t nxge = rxgroup->nxgep; - int group = rxgroup->gindex; - int rv, sindex; + nxge_ring_group_t *group = (nxge_ring_group_t *)arg; + p_nxge_t nxge = group->nxgep; + int rv; nxge_hio_vr_t *vr; /* The Virtualization Region */ - sindex = nxge->rx_hio_groups[group].sindex; - vr = (nxge_hio_vr_t *)nxge->shares[sindex].vrp; + ASSERT(group->type == MAC_RING_TYPE_RX); + + mutex_enter(nxge->genlock); /* - * Program the mac address for the group/share. + * If the group is associated with a VR, then only one + * address may be assigned to the group. */ - if ((rv = nxge_hio_hostinfo_init(nxge, vr, - (ether_addr_t *)mac_addr)) != 0) { + vr = (nxge_hio_vr_t *)nxge->shares[group->sindex].vrp; + if ((vr != NULL) && (group->n_mac_addrs)) { + mutex_exit(nxge->genlock); + return (ENOSPC); + } + + mutex_exit(nxge->genlock); + + /* + * Program the mac address for the group. + */ + if ((rv = nxge_hio_group_mac_add(nxge, group, + mac_addr)) != 0) { return (rv); } return (0); } +static int +find_mac_slot(nxge_mmac_t *mmac_info, const uint8_t *mac_addr) +{ + int i; + for (i = 0; i <= mmac_info->num_mmac; i++) { + if (memcmp(mmac_info->mac_pool[i].addr, mac_addr, + ETHERADDRL) == 0) { + return (i); + } + } + return (-1); +} + /* ARGSUSED */ static int nxge_hio_rem_mac(void *arg, const uint8_t *mac_addr) { - nxge_rx_ring_group_t *rxgroup = (nxge_rx_ring_group_t *)arg; - p_nxge_t nxge = rxgroup->nxgep; - int group = rxgroup->gindex; - int sindex; - nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_ring_group_t *group = (nxge_ring_group_t *)arg; + p_nxge_t nxge = group->nxgep; + nxge_mmac_t *mmac_info; + int rv, slot; + + ASSERT(group->type == MAC_RING_TYPE_RX); + + mutex_enter(nxge->genlock); + + mmac_info = &nxge->nxge_mmac_info; + slot = find_mac_slot(mmac_info, mac_addr); + if (slot < 0) { + mutex_exit(nxge->genlock); + return (EINVAL); + } + + mutex_exit(nxge->genlock); + + /* + * Remove the mac address for the group + */ + if ((rv = nxge_m_mmac_remove(nxge, slot)) != 0) { + return (rv); + } + + mutex_enter(nxge->genlock); + group->n_mac_addrs--; + mutex_exit(nxge->genlock); + + return (0); +} - sindex = nxge->rx_hio_groups[group].sindex; - vr = (nxge_hio_vr_t *)nxge->shares[sindex].vrp; +static int +nxge_hio_group_start(mac_group_driver_t gdriver) +{ + nxge_ring_group_t *group = (nxge_ring_group_t *)gdriver; + int rdctbl; + int dev_gindex; + + ASSERT(group->type == MAC_RING_TYPE_RX); + +#ifdef later + ASSERT(group->nxgep->nxge_mac_state == NXGE_MAC_STARTED); +#endif + if (group->nxgep->nxge_mac_state != NXGE_MAC_STARTED) + return (ENXIO); + + mutex_enter(group->nxgep->genlock); + dev_gindex = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid + + group->gindex; /* - * Remove the mac address for the group/share. + * Get an rdc table for this group. + * Group ID is given by the caller, and that's the group it needs + * to bind to. The default group is already bound when the driver + * was attached. + * + * For Group 0, it's RDC table was allocated at attach time + * no need to allocate a new table. 
*/ - nxge_hio_hostinfo_uninit(nxge, vr); + if (group->gindex != 0) { + rdctbl = nxge_fzc_rdc_tbl_bind(group->nxgep, + dev_gindex, B_TRUE); + if (rdctbl < 0) { + mutex_exit(group->nxgep->genlock); + return (rdctbl); + } + } else { + rdctbl = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid; + } + + group->rdctbl = rdctbl; + + (void) nxge_init_fzc_rdc_tbl(group->nxgep, rdctbl); + + group->started = B_TRUE; + mutex_exit(group->nxgep->genlock); return (0); } +static void +nxge_hio_group_stop(mac_group_driver_t gdriver) +{ + nxge_ring_group_t *group = (nxge_ring_group_t *)gdriver; + + ASSERT(group->type == MAC_RING_TYPE_RX); + + mutex_enter(group->nxgep->genlock); + group->started = B_FALSE; + + /* + * Unbind the RDC table previously bound for this group. + * + * Since RDC table for group 0 was allocated at attach + * time, no need to unbind the table here. + */ + if (group->gindex != 0) + (void) nxge_fzc_rdc_tbl_unbind(group->nxgep, group->rdctbl); + + mutex_exit(group->nxgep->genlock); +} + /* ARGSUSED */ void -nxge_hio_group_get(void *arg, mac_ring_type_t type, int group, +nxge_hio_group_get(void *arg, mac_ring_type_t type, int groupid, mac_group_info_t *infop, mac_group_handle_t ghdl) { - p_nxge_t nxgep = (p_nxge_t)arg; - nxge_rx_ring_group_t *rxgroup; + p_nxge_t nxgep = (p_nxge_t)arg; + nxge_ring_group_t *group; + int dev_gindex; switch (type) { case MAC_RING_TYPE_RX: - rxgroup = &nxgep->rx_hio_groups[group]; - rxgroup->gindex = group; - - infop->mrg_driver = (mac_group_driver_t)rxgroup; - infop->mrg_start = NULL; - infop->mrg_stop = NULL; - infop->mrg_addmac = nxge_hio_add_mac; - infop->mrg_remmac = nxge_hio_rem_mac; - infop->mrg_count = NXGE_HIO_SHARE_MAX_CHANNELS; + group = &nxgep->rx_hio_groups[groupid]; + group->nxgep = nxgep; + group->ghandle = ghdl; + group->gindex = groupid; + group->sindex = 0; /* not yet bound to a share */ + + dev_gindex = nxgep->pt_config.hw_config.def_mac_rxdma_grpid + + groupid; + + infop->mgi_driver = (mac_group_driver_t)group; + infop->mgi_start = nxge_hio_group_start; + infop->mgi_stop = nxge_hio_group_stop; + infop->mgi_addmac = nxge_hio_add_mac; + infop->mgi_remmac = nxge_hio_rem_mac; + infop->mgi_count = + nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs; break; case MAC_RING_TYPE_TX: + /* + * 'groupid' for TX should be incremented by one since + * the default group (groupid 0) is not known by the MAC layer + */ + group = &nxgep->tx_hio_groups[groupid + 1]; + group->nxgep = nxgep; + group->ghandle = ghdl; + group->gindex = groupid + 1; + group->sindex = 0; /* not yet bound to a share */ + + infop->mgi_driver = (mac_group_driver_t)group; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = NULL; /* not needed */ + infop->mgi_remmac = NULL; /* not needed */ + /* no rings associated with group initially */ + infop->mgi_count = 0; break; } } +#if defined(sun4v) + int nxge_hio_share_assign( nxge_t *nxge, @@ -1126,7 +1343,6 @@ nxge_hio_share_assign( NXGE_ERROR_MSG((nxge, HIO_CTL, "nxge_hio_share_assign: " "vr->assign() returned %d", hv_rv)); - nxge_hio_unshare(vr); return (-EIO); } @@ -1189,7 +1405,7 @@ nxge_hio_share_assign( return (0); } -int +void nxge_hio_share_unassign( nxge_hio_vr_t *vr) { @@ -1237,23 +1453,15 @@ nxge_hio_share_unassign( vr->cookie, hv_rv)); } } - - return (0); } int -nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie, - mac_share_handle_t *shandle) +nxge_hio_share_alloc(void *arg, mac_share_handle_t *shandle) { - p_nxge_t nxge = (p_nxge_t)arg; - nxge_rx_ring_group_t *rxgroup; - 
nxge_share_handle_t *shp; - - nxge_hio_vr_t *vr; /* The Virtualization Region */ - uint64_t rmap, tmap; - int rdctbl, rv; - - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; + p_nxge_t nxge = (p_nxge_t)arg; + nxge_share_handle_t *shp; + nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_share")); @@ -1269,65 +1477,257 @@ nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie, if ((vr = nxge_hio_vr_share(nxge)) == 0) return (EAGAIN); + shp = &nxge->shares[vr->region]; + shp->nxgep = nxge; + shp->index = vr->region; + shp->vrp = (void *)vr; + shp->tmap = shp->rmap = 0; /* to be assigned by ms_sbind */ + shp->rxgroup = 0; /* to be assigned by ms_sadd */ + shp->active = B_FALSE; /* not bound yet */ + + *shandle = (mac_share_handle_t)shp; + + NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_share")); + return (0); +} + + +void +nxge_hio_share_free(mac_share_handle_t shandle) +{ + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_hio_vr_t *vr; + /* - * Get an RDC group for us to use. + * Clear internal handle state. */ - if ((rdctbl = nxge_hio_hostinfo_get_rdc_table(nxge)) < 0) { - nxge_hio_unshare(vr); - return (EBUSY); + vr = shp->vrp; + shp->vrp = (void *)NULL; + shp->index = 0; + shp->tmap = 0; + shp->rmap = 0; + shp->rxgroup = 0; + shp->active = B_FALSE; + + /* + * Free VR resource. + */ + nxge_hio_unshare(vr); +} + + +void +nxge_hio_share_query(mac_share_handle_t shandle, mac_ring_type_t type, + mac_ring_handle_t *rings, uint_t *n_rings) +{ + nxge_t *nxge; + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_ring_handle_t *rh; + uint32_t offset; + + nxge = shp->nxgep; + + switch (type) { + case MAC_RING_TYPE_RX: + rh = nxge->rx_ring_handles; + offset = nxge->pt_config.hw_config.start_rdc; + break; + + case MAC_RING_TYPE_TX: + rh = nxge->tx_ring_handles; + offset = nxge->pt_config.hw_config.tdc.start; + break; } - vr->rdc_tbl = (uint8_t)rdctbl; + + /* + * In version 1.0, we may only give a VR 2 RDCs/TDCs. Not only that, + * but the HV has statically assigned the channels like so: + * VR0: RDC0 & RDC1 + * VR1: RDC2 & RDC3, etc. + * The TDCs are assigned in exactly the same way. + */ + if (rings != NULL) { + rings[0] = rh[(shp->index * 2) - offset].ring_handle; + rings[1] = rh[(shp->index * 2 + 1) - offset].ring_handle; + } + if (n_rings != NULL) { + *n_rings = 2; + } +} + +int +nxge_hio_share_add_group(mac_share_handle_t shandle, + mac_group_driver_t ghandle) +{ + nxge_t *nxge; + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_ring_group_t *rg = (nxge_ring_group_t *)ghandle; + nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_grp_t *group; + int i; + + if (rg->sindex != 0) { + /* the group is already bound to a share */ + return (EALREADY); + } + + nxge = rg->nxgep; + vr = shp->vrp; + + switch (rg->type) { + case MAC_RING_TYPE_RX: + /* + * Make sure that the group has the right rings associated + * for the share. In version 1.0, we may only give a VR + * 2 RDCs. Not only that, but the HV has statically + * assigned the channels like so: + * VR0: RDC0 & RDC1 + * VR1: RDC2 & RDC3, etc. 
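+		 * In other words (illustratively), VR n may only own
+		 * channels 2n and 2n + 1, which is exactly what the
+		 * loop below validates against shp->index.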
+ */ + group = nxge->rx_set.group[rg->gindex]; + + if (group->count > 2) { + /* a share can have at most 2 rings */ + return (EINVAL); + } + + for (i = 0; i < NXGE_MAX_RDCS; i++) { + if (group->map & (1 << i)) { + if ((i != shp->index * 2) && + (i != (shp->index * 2 + 1))) { + /* + * A group with invalid rings was + * attempted to bind to this share + */ + return (EINVAL); + } + } + } + + rg->sindex = vr->region; + vr->rdc_tbl = rg->rdctbl; + shp->rxgroup = vr->rdc_tbl; + break; + + case MAC_RING_TYPE_TX: + /* + * Make sure that the group has the right rings associated + * for the share. In version 1.0, we may only give a VR + * 2 TDCs. Not only that, but the HV has statically + * assigned the channels like so: + * VR0: TDC0 & TDC1 + * VR1: TDC2 & TDC3, etc. + */ + group = nxge->tx_set.group[rg->gindex]; + + if (group->count > 2) { + /* a share can have at most 2 rings */ + return (EINVAL); + } + + for (i = 0; i < NXGE_MAX_TDCS; i++) { + if (group->map & (1 << i)) { + if ((i != shp->index * 2) && + (i != (shp->index * 2 + 1))) { + /* + * A group with invalid rings was + * attempted to bind to this share + */ + return (EINVAL); + } + } + } + + vr->tdc_tbl = nxge->pt_config.hw_config.def_mac_txdma_grpid + + rg->gindex; + rg->sindex = vr->region; + break; + } + return (0); +} + +int +nxge_hio_share_rem_group(mac_share_handle_t shandle, + mac_group_driver_t ghandle) +{ + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_ring_group_t *group = (nxge_ring_group_t *)ghandle; + nxge_hio_vr_t *vr; /* The Virtualization Region */ + int rv = 0; + + vr = shp->vrp; + + switch (group->type) { + case MAC_RING_TYPE_RX: + group->sindex = 0; + vr->rdc_tbl = 0; + shp->rxgroup = 0; + break; + + case MAC_RING_TYPE_TX: + group->sindex = 0; + vr->tdc_tbl = 0; + break; + } + + return (rv); +} + +int +nxge_hio_share_bind(mac_share_handle_t shandle, uint64_t cookie, + uint64_t *rcookie) +{ + nxge_t *nxge; + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_hio_vr_t *vr; + uint64_t rmap, tmap, hv_rmap, hv_tmap; + int rv; + + nxge = shp->nxgep; + vr = (nxge_hio_vr_t *)shp->vrp; /* * Add resources to the share. + * For each DMA channel associated with the VR, bind its resources + * to the VR. */ tmap = 0; - rv = nxge_hio_addres(vr, MAC_RING_TYPE_TX, - NXGE_HIO_SHARE_MAX_CHANNELS); + rv = nxge_hio_addres(vr, MAC_RING_TYPE_TX, &tmap); if (rv != 0) { - nxge_hio_unshare(vr); return (rv); } rmap = 0; - rv = nxge_hio_addres(vr, MAC_RING_TYPE_RX, - NXGE_HIO_SHARE_MAX_CHANNELS); + rv = nxge_hio_addres(vr, MAC_RING_TYPE_RX, &rmap); if (rv != 0) { nxge_hio_remres(vr, MAC_RING_TYPE_TX, tmap); - nxge_hio_unshare(vr); return (rv); } - if ((rv = nxge_hio_share_assign(nxge, cookie, &tmap, &rmap, vr))) { - nxge_hio_remres(vr, MAC_RING_TYPE_RX, tmap); + /* + * Ask the Hypervisor to set up the VR and allocate slots for + * each rings associated with the VR. 
+ */ + hv_tmap = hv_rmap = 0; + if ((rv = nxge_hio_share_assign(nxge, cookie, + &hv_tmap, &hv_rmap, vr))) { nxge_hio_remres(vr, MAC_RING_TYPE_TX, tmap); - nxge_hio_unshare(vr); + nxge_hio_remres(vr, MAC_RING_TYPE_RX, rmap); return (rv); } - rxgroup = &nxge->rx_hio_groups[vr->rdc_tbl]; - rxgroup->gindex = vr->rdc_tbl; - rxgroup->sindex = vr->region; - - shp = &nxge->shares[vr->region]; - shp->index = vr->region; - shp->vrp = (void *)vr; - shp->tmap = tmap; - shp->rmap = rmap; - shp->rxgroup = vr->rdc_tbl; shp->active = B_TRUE; + shp->tmap = hv_tmap; + shp->rmap = hv_rmap; /* high 32 bits are cfg_hdl and low 32 bits are HV cookie */ *rcookie = (((uint64_t)nxge->niu_cfg_hdl) << 32) | vr->cookie; - *shandle = (mac_share_handle_t)shp; - - NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_share")); return (0); } void -nxge_hio_share_free(mac_share_handle_t shandle) +nxge_hio_share_unbind(mac_share_handle_t shandle) { nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; @@ -1335,52 +1735,15 @@ nxge_hio_share_free(mac_share_handle_t shandle) * First, unassign the VR (take it back), * so we can enable interrupts again. */ - (void) nxge_hio_share_unassign(shp->vrp); + nxge_hio_share_unassign(shp->vrp); /* * Free Ring Resources for TX and RX */ nxge_hio_remres(shp->vrp, MAC_RING_TYPE_TX, shp->tmap); nxge_hio_remres(shp->vrp, MAC_RING_TYPE_RX, shp->rmap); - - /* - * Free VR resource. - */ - nxge_hio_unshare(shp->vrp); - - /* - * Clear internal handle state. - */ - shp->index = 0; - shp->vrp = (void *)NULL; - shp->tmap = 0; - shp->rmap = 0; - shp->rxgroup = 0; - shp->active = B_FALSE; } -void -nxge_hio_share_query(mac_share_handle_t shandle, mac_ring_type_t type, - uint32_t *rmin, uint32_t *rmax, uint64_t *rmap, uint64_t *gnum) -{ - nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; - - switch (type) { - case MAC_RING_TYPE_RX: - *rmin = NXGE_HIO_SHARE_MIN_CHANNELS; - *rmax = NXGE_HIO_SHARE_MAX_CHANNELS; - *rmap = shp->rmap; - *gnum = shp->rxgroup; - break; - - case MAC_RING_TYPE_TX: - *rmin = NXGE_HIO_SHARE_MIN_CHANNELS; - *rmax = NXGE_HIO_SHARE_MAX_CHANNELS; - *rmap = shp->tmap; - *gnum = 0; - break; - } -} /* * nxge_hio_vr_share @@ -1474,7 +1837,11 @@ nxge_hio_unshare( * * nxge_hio_hostinfo_uninit(nxge, vr); */ - (void) nxge_fzc_rdc_tbl_unbind(nxge, vr->rdc_tbl); + + /* + * XXX: This is done by ms_sremove? + * (void) nxge_fzc_rdc_tbl_unbind(nxge, vr->rdc_tbl); + */ nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; @@ -1495,23 +1862,53 @@ int nxge_hio_addres( nxge_hio_vr_t *vr, mac_ring_type_t type, - int count) + uint64_t *map) { - nxge_t *nxge = (nxge_t *)vr->nxge; - int i; + nxge_t *nxge = (nxge_t *)vr->nxge; + nxge_grp_t *group; + int groupid; + int i; + int max_dcs; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_addres")); if (!nxge) return (EINVAL); - for (i = 0; i < count; i++) { - int rv; - if ((rv = nxge_hio_dc_share(nxge, vr, type)) < 0) { - if (i == 0) /* Couldn't get even one DC. */ - return (-rv); - else - break; + /* + * For each ring associated with the group, add the resources + * to the group and bind. + */ + max_dcs = (type == MAC_RING_TYPE_TX) ? 
NXGE_MAX_TDCS : NXGE_MAX_RDCS; + if (type == MAC_RING_TYPE_TX) { + /* set->group is an array of groups indexed by port group ID */ + groupid = vr->tdc_tbl - + nxge->pt_config.hw_config.def_mac_txdma_grpid; + group = nxge->tx_set.group[groupid]; + } else { + /* set->group is an array of groups indexed by port group ID */ + groupid = vr->rdc_tbl - + nxge->pt_config.hw_config.def_mac_rxdma_grpid; + group = nxge->rx_set.group[groupid]; + } + + if (group->map == 0) { + NXGE_DEBUG_MSG((nxge, HIO_CTL, "There are no rings associated " + "with this VR")); + return (EINVAL); + } + + for (i = 0; i < max_dcs; i++) { + if (group->map & (1 << i)) { + int rv; + + if ((rv = nxge_hio_dc_share(nxge, vr, type, i)) < 0) { + if (*map == 0) /* Couldn't get even one DC. */ + return (-rv); + else + break; + } + *map |= (1 << i); } } @@ -1538,6 +1935,10 @@ nxge_hio_remres( NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_remres(%lx)", res_map)); + /* + * For each ring bound to the group, remove the DMA resources + * from the group and unbind. + */ group = (type == MAC_RING_TYPE_TX ? &vr->tx_group : &vr->rx_group); while (group->dc) { nxge_hio_dc_t *dc = group->dc; @@ -1628,12 +2029,11 @@ nxge_hio_tdc_share( nxge->tdc_is_shared[channel] = B_TRUE; MUTEX_EXIT(&nhd->lock); - if (nxge_intr_remove(nxge, VP_BOUND_TX, channel) != NXGE_OK) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_tdc_share: " "Failed to remove interrupt for TxDMA channel %d", channel)); - return (NXGE_ERROR); + return (-EINVAL); } /* Disable TxDMA A.9.6.10 */ @@ -1698,13 +2098,9 @@ nxge_hio_rdc_share( nxge_hio_vr_t *vr, int channel) { - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hw_pt_cfg_t *hardware = &nxge->pt_config.hw_config; nxge_grp_set_t *set = &nxge->rx_set; nxge_rdc_grp_t *rdc_grp; - int current, last; - NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_rdc_share")); /* Disable interrupts. */ @@ -1739,21 +2135,6 @@ nxge_hio_rdc_share( nxge_grp_dc_remove(nxge, VP_BOUND_RX, channel); /* - * We have to reconfigure the RDC table(s) - * to which this channel belongs. - */ - current = hardware->def_mac_rxdma_grpid; - last = current + hardware->max_rdc_grpids; - for (; current < last; current++) { - if (nhd->rdc_tbl[current].nxge == (uintptr_t)nxge) { - rdc_grp = &nxge->pt_config.rdc_grps[current]; - rdc_grp->map = set->owned.map; - rdc_grp->max_rdcs--; - (void) nxge_init_fzc_rdc_tbl(nxge, current); - } - } - - /* * The guest domain will reconfigure the RDC later. * * But in the meantime, we must re-enable the Rx MAC so @@ -1791,12 +2172,6 @@ nxge_hio_rdc_share( } NXGE_DC_SET(rdc_grp->map, channel); - if (nxge_init_fzc_rdc_tbl(nxge, vr->rdc_tbl) != NXGE_OK) { - NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_rdc_share: nxge_init_fzc_rdc_tbl failed")); - return (-EIO); - } - NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_rdc_share")); return (0); @@ -1811,8 +2186,7 @@ nxge_hio_rdc_share( * nxge * vr The VR that <channel> will belong to. * type Tx or Rx. - * res_map The resource map used by the caller, which we will - * update if successful. + * channel Channel to share * * Notes: * @@ -1823,59 +2197,17 @@ int nxge_hio_dc_share( nxge_t *nxge, nxge_hio_vr_t *vr, - mac_ring_type_t type) + mac_ring_type_t type, + int channel) { nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hw_pt_cfg_t *hardware; nxge_hio_dc_t *dc; - int channel, limit; - - nxge_grp_set_t *set; nxge_grp_t *group; - int slot; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_dc_share(%cdc %d", type == MAC_RING_TYPE_TX ?
't' : 'r', channel)); - /* - * In version 1.0, we may only give a VR 2 RDCs or TDCs. - * Not only that, but the HV has statically assigned the - * channels like so: - * VR0: RDC0 & RDC1 - * VR1: RDC2 & RDC3, etc. - * The TDCs are assigned in exactly the same way. - * - * So, for example - * hardware->start_rdc + vr->region * 2; - * VR1: hardware->start_rdc + 1 * 2; - * VR3: hardware->start_rdc + 3 * 2; - * If start_rdc is 0, we end up with 2 or 6. - * If start_rdc is 8, we end up with 10 or 14. - */ - - set = (type == MAC_RING_TYPE_TX ? &nxge->tx_set : &nxge->rx_set); - hardware = &nxge->pt_config.hw_config; - - // This code is still NIU-specific (assuming only 2 ports) - channel = hardware->start_rdc + (vr->region % 4) * 2; - limit = channel + 2; - - MUTEX_ENTER(&nhd->lock); - for (; channel < limit; channel++) { - if ((1 << channel) & set->owned.map) { - break; - } - } - - if (channel == limit) { - MUTEX_EXIT(&nhd->lock); - NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_dc_share: there are no channels to share")); - return (-EIO); - } - - MUTEX_EXIT(&nhd->lock); /* -------------------------------------------------- */ slot = (type == MAC_RING_TYPE_TX) ? @@ -1884,9 +2216,9 @@ nxge_hio_dc_share( if (slot < 0) { if (type == MAC_RING_TYPE_RX) { - nxge_hio_rdc_unshare(nxge, channel); + nxge_hio_rdc_unshare(nxge, vr->rdc_tbl, channel); } else { - nxge_hio_tdc_unshare(nxge, channel); + nxge_hio_tdc_unshare(nxge, vr->tdc_tbl, channel); } return (slot); } @@ -1912,7 +2244,6 @@ nxge_hio_dc_share( group = (type == MAC_RING_TYPE_TX ? &vr->tx_group : &vr->rx_group); dc->group = group; - /* Initialize <group>, if necessary */ if (group->count == 0) { group->nxge = nxge; @@ -1952,16 +2283,21 @@ nxge_hio_dc_share( void nxge_hio_tdc_unshare( nxge_t *nxge, + int dev_grpid, int channel) { nxge_grp_set_t *set = &nxge->tx_set; - nxge_grp_t *group = set->group[0]; + nxge_grp_t *group; + int grpid; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_tdc_unshare")); NXGE_DC_RESET(set->shared.map, channel); set->shared.count--; + grpid = dev_grpid - nxge->pt_config.hw_config.def_mac_txdma_grpid; + group = set->group[grpid]; + if ((nxge_grp_dc_add(nxge, group, VP_BOUND_TX, channel))) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_tdc_unshare: " "Failed to initialize TxDMA channel %d", channel)); @@ -1994,14 +2330,12 @@ nxge_hio_tdc_unshare( void nxge_hio_rdc_unshare( nxge_t *nxge, + int dev_grpid, int channel) { - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hw_pt_cfg_t *hardware = &nxge->pt_config.hw_config; - - nxge_grp_set_t *set = &nxge->rx_set; - nxge_grp_t *group = set->group[0]; - int current, last; + nxge_grp_set_t *set = &nxge->rx_set; + nxge_grp_t *group; + int grpid; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_rdc_unshare")); @@ -2024,6 +2358,9 @@ nxge_hio_rdc_unshare( NXGE_DC_RESET(set->shared.map, channel); set->shared.count--; + grpid = dev_grpid - nxge->pt_config.hw_config.def_mac_rxdma_grpid; + group = set->group[grpid]; + /* * Assert RST: RXDMA_CFIG1[30] = 1 * @@ -2035,7 +2372,7 @@ nxge_hio_rdc_unshare( /* Be sure to re-enable the RX MAC. */ if (nxge_rx_mac_enable(nxge) != NXGE_OK) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_rdc_unshare: Rx MAC still disabled")); + "nxge_hio_rdc_share: Rx MAC still disabled")); } NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_rdc_unshare: " "Failed to initialize RxDMA channel %d", channel)); @@ -2043,27 +2380,11 @@ nxge_hio_rdc_unshare( } /* - * We have to reconfigure the RDC table(s) - * to which this channel once again belongs. 
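Both nxge_hio_tdc_unshare() and nxge_hio_rdc_unshare() above recover a per-port group index by subtracting the port's default group ID from the device-wide ID before indexing set->group[]. A sketch of that conversion, with a bounds check added for illustration (the names and the guard are assumptions; only the subtraction mirrors the driver):

    /* Map a device-relative group ID to an index into set->group[]. */
    static int
    port_group_index(int dev_grpid, int def_grpid, int max_grpids)
    {
            int grpid = dev_grpid - def_grpid;

            return ((grpid >= 0 && grpid < max_grpids) ? grpid : -1);
    }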
- */ - current = hardware->def_mac_rxdma_grpid; - last = current + hardware->max_rdc_grpids; - for (; current < last; current++) { - if (nhd->rdc_tbl[current].nxge == (uintptr_t)nxge) { - nxge_rdc_grp_t *group; - group = &nxge->pt_config.rdc_grps[current]; - group->map = set->owned.map; - group->max_rdcs++; - (void) nxge_init_fzc_rdc_tbl(nxge, current); - } - } - - /* * Enable RxMAC = A.9.2.10 */ if (nxge_rx_mac_enable(nxge) != NXGE_OK) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_rdc_unshare: Rx MAC still disabled")); + "nxge_hio_rdc_share: Rx MAC still disabled")); return; } @@ -2120,9 +2441,9 @@ nxge_hio_dc_unshare( dc->cookie = 0; if (type == MAC_RING_TYPE_RX) { - nxge_hio_rdc_unshare(nxge, channel); + nxge_hio_rdc_unshare(nxge, vr->rdc_tbl, channel); } else { - nxge_hio_tdc_unshare(nxge, channel); + nxge_hio_tdc_unshare(nxge, vr->tdc_tbl, channel); } NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_dc_unshare")); diff --git a/usr/src/uts/common/io/nxge/nxge_hio_guest.c b/usr/src/uts/common/io/nxge/nxge_hio_guest.c index 5fbcbfdfe1..5517b9ceee 100644 --- a/usr/src/uts/common/io/nxge/nxge_hio_guest.c +++ b/usr/src/uts/common/io/nxge/nxge_hio_guest.c @@ -208,7 +208,6 @@ static void nxge_check_guest_state(nxge_hio_vr_t *); * Guest domain */ /* ARGSUSED */ - int nxge_hio_vr_add(nxge_t *nxge) { @@ -249,7 +248,7 @@ nxge_hio_vr_add(nxge_t *nxge) return (NXGE_ERROR); } - cookie = (uint32_t)reg_val[0]; + cookie = (uint32_t)(reg_val[0]); ddi_prop_free(reg_val); fp = &nhd->hio.vr; @@ -521,11 +520,17 @@ res_map_parse( */ if (type == NXGE_TRANSMIT_GROUP) { nxge_dma_pt_cfg_t *port = &nxge->pt_config; + nxge_tdc_grp_t *tdc_grp = &nxge->pt_config.tdc_grps[0]; hardware->tdc.start = first; hardware->tdc.count = count; hardware->tdc.owned = count; + tdc_grp->start_tdc = first; + tdc_grp->max_tdcs = (uint8_t)count; + tdc_grp->grp_index = group->index; + tdc_grp->map = slots; + group->map = slots; /* @@ -944,7 +949,6 @@ nxge_check_guest_state( NXGE_DEBUG_MSG((nxge, SYSERR_CTL, "==> nxge_check_guest_state")); MUTEX_ENTER(nxge->genlock); - nxge->nxge_timerid = 0; if (nxge->nxge_mac_state == NXGE_MAC_STARTED) { diff --git a/usr/src/uts/common/io/nxge/nxge_hv.c b/usr/src/uts/common/io/nxge/nxge_hv.c index a454b3ee72..1a42fcd9a7 100644 --- a/usr/src/uts/common/io/nxge/nxge_hv.c +++ b/usr/src/uts/common/io/nxge/nxge_hv.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * nxge_hv.c * @@ -37,6 +35,8 @@ #include <sys/nxge/nxge_impl.h> #include <sys/nxge/nxge_hio.h> +#if defined(sun4v) + void nxge_hio_hv_init(nxge_t *nxge) { @@ -79,3 +79,5 @@ nxge_hio_hv_init(nxge_t *nxge) rx->getinfo = &hv_niu_vrrx_getinfo; } + +#endif /* defined(sun4v) */ diff --git a/usr/src/uts/common/io/nxge/nxge_hw.c b/usr/src/uts/common/io/nxge/nxge_hw.c index 4a6cbbea6d..5513ce4f4e 100644 --- a/usr/src/uts/common/io/nxge/nxge_hw.c +++ b/usr/src/uts/common/io/nxge/nxge_hw.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/nxge/nxge_impl.h> /* @@ -221,7 +219,6 @@ nxge_intr(void *arg1, void *arg2) NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr(%d): #ldvs %d " " #intrs %d", i, nldvs, nintrs)); /* Get this group's flag bits. 
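res_map_parse() above records the guest's transmit channels twice: as a <first, count> range in hardware->tdc and as a bitmap in tdc_grp->map (the slots value). Assuming slots is the bitmap form of that same range, the two encodings relate as in this hypothetical helper (valid for 0 < count < 64):

    #include <stdint.h>

    /* Bitmap with <count> bits set starting at bit <first>;
     * e.g. first = 4, count = 2 yields 0x30. */
    static uint64_t
    range_to_map(int first, int count)
    {
            return (((1ULL << count) - 1) << first);
    }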
*/ - t_ldgp->interrupted = B_FALSE; rs = npi_ldsv_ldfs_get(handle, t_ldgp->ldg, &vector0, &vector1, &vector2); if (rs) { @@ -235,7 +232,6 @@ nxge_intr(void *arg1, void *arg2) NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: " "vector0 0x%llx vector1 0x%llx vector2 0x%llx", vector0, vector1, vector2)); - t_ldgp->interrupted = B_TRUE; nldvs = t_ldgp->nldvs; for (j = 0; j < nldvs; j++, t_ldvp++) { /* @@ -261,12 +257,10 @@ nxge_intr(void *arg1, void *arg2) t_ldgp = ldgp; for (i = 0; i < nintrs; i++, t_ldgp++) { /* rearm group interrupts */ - if (t_ldgp->interrupted) { - NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: arm " - "group %d", t_ldgp->ldg)); - (void) npi_intr_ldg_mgmt_set(handle, t_ldgp->ldg, - t_ldgp->arm, t_ldgp->ldg_timer); - } + NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: arm " + "group %d", t_ldgp->ldg)); + (void) npi_intr_ldg_mgmt_set(handle, t_ldgp->ldg, + t_ldgp->arm, t_ldgp->ldg_timer); } NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_intr: serviced 0x%x", diff --git a/usr/src/uts/common/io/nxge/nxge_mac.c b/usr/src/uts/common/io/nxge/nxge_mac.c index d009bdbd98..8ca60cf7a7 100644 --- a/usr/src/uts/common/io/nxge/nxge_mac.c +++ b/usr/src/uts/common/io/nxge/nxge_mac.c @@ -46,13 +46,6 @@ extern uint32_t nxge_lb_dbg; extern boolean_t nxge_jumbo_enable; extern uint32_t nxge_jumbo_mtu; - /* The following functions may be found in nxge_main.c */ -extern void nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, - boolean_t factory); -extern int nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr); -extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); -extern int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, - mac_addr_slot_t slot, uint8_t rdctbl); typedef enum { CHECK_LINK_RESCHEDULE, @@ -3040,160 +3033,6 @@ fail: return (NXGE_ERROR | rs); } -int -nxge_hio_hostinfo_get_rdc_table(p_nxge_t nxgep) -{ - int rdc_tbl; - - /* - * Get an RDC table (version 0). - */ - if ((rdc_tbl = nxge_fzc_rdc_tbl_bind(nxgep, -1, B_FALSE)) < 0) { - NXGE_ERROR_MSG((nxgep, OBP_CTL, - "nxge_hio_hostinfo_get_rdc_table: " - "there are no free RDC tables!")); - return (EBUSY); - } - - return (rdc_tbl); -} - -/* - * nxge_hio_hostinfo_init - * - * Initialize an alternate MAC address, and bind a macrdctbln to it. - * - * Arguments: - * nxge - * vr The Virtualization Region - * macaddr The alternate MAC address - * - * Notes: - * 1. Find & bind an RDC table to <nxge>. - * 2. Program an alternate MAC address (<macaddr>). - * 3. Bind the RDC table to <macaddr>. - * - * Context: - * Service domain - * - * Side Effects: - * nxge->class_config.mac_host_info[slot].rdctbl - * vr->slot & vr->altmac - * - */ -int -nxge_hio_hostinfo_init(nxge_t *nxge, nxge_hio_vr_t *vr, ether_addr_t *macaddr) -{ - int slot, error; - uint8_t rdc_tbl; - nxge_mmac_t *mmac_info; - nxge_rdc_grp_t *group; - uint8_t *addr = (uint8_t *)macaddr; - - mutex_enter(nxge->genlock); - - rdc_tbl = (uint8_t)vr->rdc_tbl; - - /* Initialize the NXGE RDC table data structure. */ - group = &nxge->pt_config.rdc_grps[rdc_tbl]; - group->port = NXGE_GET_PORT_NUM(nxge->function_num); - group->config_method = RDC_TABLE_ENTRY_METHOD_REP; - group->flag = 1; /* This group has been configured. */ - - mmac_info = &nxge->nxge_mmac_info; - - /* - * Are there free slots. - */ - if (mmac_info->naddrfree == 0) { - mutex_exit(nxge->genlock); - return (ENOSPC); - } - - /* - * Find a slot for the VR to use for Hybrid I/O. 
- */ - if (mmac_info->num_factory_mmac < mmac_info->num_mmac) { - for (slot = mmac_info->num_factory_mmac + 1; - slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - if (slot > mmac_info->num_mmac) { - for (slot = 1; slot <= mmac_info->num_factory_mmac; - slot++) { - if (!(mmac_info->mac_pool[slot].flags - & MMAC_SLOT_USED)) - break; - } - } - } else { - for (slot = 1; slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - } - ASSERT(slot <= mmac_info->num_mmac); - vr->slot = slot; - - error = nxge_altmac_set(nxge, addr, slot, rdc_tbl); - if (error != 0) { - mutex_exit(nxge->genlock); - return (EIO); - } - - bcopy(macaddr, vr->altmac, sizeof (vr->altmac)); - - /* - * Update mmac - */ - bcopy(addr, mmac_info->mac_pool[vr->slot].addr, ETHERADDRL); - mmac_info->mac_pool[vr->slot].flags |= MMAC_SLOT_USED; - mmac_info->mac_pool[vr->slot].flags &= ~MMAC_VENDOR_ADDR; - mmac_info->naddrfree--; - nxge_mmac_kstat_update(nxge, vr->slot, B_FALSE); - - mutex_exit(nxge->genlock); - return (0); -} - -/* - * nxge_hio_hostinfo_uninit - * - * Uninitialize an alternate MAC address. - * - * Arguments: - * nxge - * vr The Virtualization Region - * - * Notes: - * Remove the VR's alternate MAC address. - * - * Context: - * Service domain - * - * Side Effects: - * nxge->class_config.mac_host_info[slot].rdctbl - * - */ -void -nxge_hio_hostinfo_uninit(nxge_t *nxge, nxge_hio_vr_t *vr) -{ - nxge_class_pt_cfg_t *class; - uint8_t addrn; - - addrn = vr->slot - 1; - (void) npi_mac_altaddr_disable(nxge->npi_handle, - nxge->mac.portnum, addrn); - - /* Set this variable to its default. */ - class = (p_nxge_class_pt_cfg_t)&nxge->class_config; - class->mac_host_info[addrn].rdctbl = - nxge->pt_config.hw_config.def_mac_rxdma_grpid; - - (void) nxge_m_mmac_remove(nxge, vr->slot); - vr->slot = -1; -} /* Initialize the RxMAC sub-block */ diff --git a/usr/src/uts/common/io/nxge/nxge_main.c b/usr/src/uts/common/io/nxge/nxge_main.c index ca2ca6b30b..9b20c438f4 100644 --- a/usr/src/uts/common/io/nxge/nxge_main.c +++ b/usr/src/uts/common/io/nxge/nxge_main.c @@ -117,14 +117,6 @@ nxge_tx_mode_t nxge_tx_scheme = NXGE_USE_SERIAL; #define NXGE_LSO_MAXLEN 65535 uint32_t nxge_lso_max = NXGE_LSO_MAXLEN; -/* - * Debugging flags: - * nxge_no_tx_lb : transmit load balancing - * nxge_tx_lb_policy: 0 - TCP port (default) - * 3 - DEST MAC - */ -uint32_t nxge_no_tx_lb = 0; -uint32_t nxge_tx_lb_policy = NXGE_TX_LB_TCPUDP; /* * Add tunable to reduce the amount of time spent in the @@ -208,8 +200,7 @@ static void nxge_remove_hard_properties(p_nxge_t); /* * These two functions are required by nxge_hio.c */ -extern int nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr); -extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); +extern int nxge_m_mmac_remove(void *arg, int slot); extern void nxge_grp_cleanup(p_nxge_t nxge); static nxge_status_t nxge_setup_system_dma_pages(p_nxge_t); @@ -224,9 +215,7 @@ static void nxge_test_map_regs(p_nxge_t nxgep); #endif static nxge_status_t nxge_add_intrs(p_nxge_t nxgep); -static nxge_status_t nxge_add_soft_intrs(p_nxge_t nxgep); static void nxge_remove_intrs(p_nxge_t nxgep); -static void nxge_remove_soft_intrs(p_nxge_t nxgep); static nxge_status_t nxge_add_intrs_adv(p_nxge_t nxgep); static nxge_status_t nxge_add_intrs_adv_type(p_nxge_t, uint32_t); @@ -284,20 +273,19 @@ extern int nxge_param_set_mac(p_nxge_t, queue_t *, mblk_t *, */ static int nxge_m_start(void *); static void nxge_m_stop(void *); 
-static int nxge_m_unicst(void *, const uint8_t *); static int nxge_m_multicst(void *, boolean_t, const uint8_t *); static int nxge_m_promisc(void *, boolean_t); static void nxge_m_ioctl(void *, queue_t *, mblk_t *); -static void nxge_m_resources(void *); -mblk_t *nxge_m_tx(void *arg, mblk_t *); static nxge_status_t nxge_mac_register(p_nxge_t); -int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, - mac_addr_slot_t slot, uint8_t rdctbl); -void nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, +static int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, + int slot, int rdctbl, boolean_t usetbl); +void nxge_mmac_kstat_update(p_nxge_t nxgep, int slot, boolean_t factory); -static int nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr); -static int nxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr); -static int nxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr); +#if defined(sun4v) +extern mblk_t *nxge_m_tx(void *arg, mblk_t *mp); +#endif + +static void nxge_m_getfactaddr(void *, uint_t, uint8_t *); static boolean_t nxge_m_getcapab(void *, mac_capab_t, void *); static int nxge_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); @@ -308,6 +296,12 @@ static int nxge_set_priv_prop(nxge_t *, const char *, uint_t, static int nxge_get_priv_prop(nxge_t *, const char *, uint_t, uint_t, void *, uint_t *); static int nxge_get_def_val(nxge_t *, mac_prop_id_t, uint_t, void *); +static void nxge_fill_ring(void *, mac_ring_type_t, const int, const int, + mac_ring_info_t *, mac_ring_handle_t); +static void nxge_group_add_ring(mac_group_driver_t, mac_ring_driver_t, + mac_ring_type_t); +static void nxge_group_rem_ring(mac_group_driver_t, mac_ring_driver_t, + mac_ring_type_t); static void nxge_niu_peu_reset(p_nxge_t nxgep); static void nxge_set_pci_replay_timeout(nxge_t *); @@ -336,15 +330,11 @@ mac_priv_prop_t nxge_priv_props[] = { #define NXGE_MAX_PRIV_PROPS \ (sizeof (nxge_priv_props)/sizeof (mac_priv_prop_t)) -#define NXGE_M_CALLBACK_FLAGS\ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) - - #define NXGE_NEPTUNE_MAGIC 0x4E584745UL #define MAX_DUMP_SZ 256 #define NXGE_M_CALLBACK_FLAGS \ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) mac_callbacks_t nxge_m_callbacks = { NXGE_M_CALLBACK_FLAGS, @@ -353,9 +343,8 @@ mac_callbacks_t nxge_m_callbacks = { nxge_m_stop, nxge_m_promisc, nxge_m_multicst, - nxge_m_unicst, - nxge_m_tx, - nxge_m_resources, + NULL, + NULL, nxge_m_ioctl, nxge_m_getcapab, NULL, @@ -631,6 +620,11 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) if (nxgep->niu_type != N2_NIU) { nxge_set_pci_replay_timeout(nxgep); } +#if defined(sun4v) + if (isLDOMguest(nxgep)) { + nxge_m_callbacks.mc_tx = nxge_m_tx; + } +#endif #if defined(sun4v) /* This is required by nxge_hio_init(), which follows. */ @@ -847,13 +841,6 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) goto nxge_attach_fail; } - status = nxge_add_soft_intrs(nxgep); - if (status != DDI_SUCCESS) { - NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, - "add_soft_intr failed")); - goto nxge_attach_fail; - } - /* If a guest, register with vio_net instead. */ if ((status = nxge_mac_register(nxgep)) != NXGE_OK) { NXGE_DEBUG_MSG((nxgep, DDI_CTL, @@ -1032,9 +1019,6 @@ nxge_unattach(p_nxge_t nxgep) */ nxge_remove_intrs(nxgep); - /* remove soft interrups */ - nxge_remove_soft_intrs(nxgep); - /* * Stop the device and free resources. 
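One consequence of the callback changes above: mc_unicst and mc_tx are now NULL by default, and nxge_attach() patches mc_tx back in only for LDOM guests, so service domains transmit through the per-ring entry points instead. Reduced to a standalone sketch (ops_t and the names here are stand-ins, not the real mac_callbacks_t):

    typedef struct ops {
            void *(*tx)(void *arg, void *pkt);      /* NULL: use per-ring tx */
    } ops_t;

    static ops_t driver_ops = { NULL };

    static void
    attach_fixup(int is_guest, void *(*guest_tx)(void *, void *))
    {
            if (is_guest)
                    driver_ops.tx = guest_tx;       /* guests keep a direct entry */
    }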
*/ @@ -3742,6 +3726,20 @@ nxge_m_start_exit: return (0); } + +static boolean_t +nxge_check_groups_stopped(p_nxge_t nxgep) +{ + int i; + + for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { + if (nxgep->rx_hio_groups[i].started) + return (B_FALSE); + } + + return (B_TRUE); +} + /* * nxge_m_stop(): stop transmitting and receiving. */ @@ -3749,9 +3747,21 @@ static void nxge_m_stop(void *arg) { p_nxge_t nxgep = (p_nxge_t)arg; + boolean_t groups_stopped; NXGE_DEBUG_MSG((nxgep, NXGE_CTL, "==> nxge_m_stop")); + groups_stopped = nxge_check_groups_stopped(nxgep); +#ifdef later + ASSERT(groups_stopped == B_FALSE); +#endif + + if (!groups_stopped) { + cmn_err(CE_WARN, "nxge(%d): groups are not stopped!\n", + nxgep->instance); + return; + } + MUTEX_ENTER(nxgep->genlock); nxgep->nxge_mac_state = NXGE_MAC_STOPPING; @@ -3770,26 +3780,6 @@ nxge_m_stop(void *arg) } static int -nxge_m_unicst(void *arg, const uint8_t *macaddr) -{ - p_nxge_t nxgep = (p_nxge_t)arg; - struct ether_addr addrp; - - NXGE_DEBUG_MSG((nxgep, MAC_CTL, "==> nxge_m_unicst")); - - bcopy(macaddr, (uint8_t *)&addrp, ETHERADDRL); - if (nxge_set_mac_addr(nxgep, &addrp)) { - NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, - "<== nxge_m_unicst: set unitcast failed")); - return (EINVAL); - } - - NXGE_DEBUG_MSG((nxgep, MAC_CTL, "<== nxge_m_unicst")); - - return (0); -} - -static int nxge_m_multicst(void *arg, boolean_t add, const uint8_t *mca) { p_nxge_t nxgep = (p_nxge_t)arg; @@ -3942,77 +3932,8 @@ nxge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) extern void nxge_rx_hw_blank(void *arg, time_t ticks, uint_t count); -static void -nxge_m_resources(void *arg) -{ - p_nxge_t nxgep = arg; - mac_rx_fifo_t mrf; - - nxge_grp_set_t *set = &nxgep->rx_set; - uint8_t rdc; - - rx_rcr_ring_t *ring; - - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_m_resources")); - - MUTEX_ENTER(nxgep->genlock); - - if (set->owned.map == 0) { - NXGE_ERROR_MSG((NULL, NXGE_ERR_CTL, - "nxge_m_resources: no receive resources")); - goto nxge_m_resources_exit; - } - - /* - * CR 6492541 Check to see if the drv_state has been initialized, - * if not * call nxge_init(). - */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - if (nxge_init(nxgep) != NXGE_OK) - goto nxge_m_resources_exit; - } - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = nxge_rx_hw_blank; - mrf.mrf_arg = (void *)nxgep; - - mrf.mrf_normal_blank_time = 128; - mrf.mrf_normal_pkt_count = 8; - - /* - * Export our receive resources to the MAC layer. - */ - for (rdc = 0; rdc < NXGE_MAX_RDCS; rdc++) { - if ((1 << rdc) & set->owned.map) { - ring = nxgep->rx_rcr_rings->rcr_rings[rdc]; - if (ring == 0) { - /* - * This is a big deal only if we are - * *not* in an LDOMs environment. 
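nxge_check_groups_stopped() above is a plain scan of the per-group started flags; nxge_m_stop() refuses to proceed while any RX group is still running. The shape of the guard, standalone (group_t and the field name are illustrative):

    #include <stdbool.h>

    typedef struct group { bool started; } group_t;

    static bool
    groups_stopped(const group_t *groups, int ngroups)
    {
            int i;

            for (i = 0; i < ngroups; i++) {
                    if (groups[i].started)
                            return (false);
            }
            return (true);
    }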
- */ - if (nxgep->environs == SOLARIS_DOMAIN) { - cmn_err(CE_NOTE, - "==> nxge_m_resources: " - "ring %d == 0", rdc); - } - continue; - } - ring->rcr_mac_handle = mac_resource_add - (nxgep->mach, (mac_resource_t *)&mrf); - - NXGE_DEBUG_MSG((nxgep, NXGE_CTL, - "==> nxge_m_resources: RDC %d RCR %p MAC handle %p", - rdc, ring, ring->rcr_mac_handle)); - } - } - -nxge_m_resources_exit: - MUTEX_EXIT(nxgep->genlock); - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_m_resources")); -} - void -nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, boolean_t factory) +nxge_mmac_kstat_update(p_nxge_t nxgep, int slot, boolean_t factory) { p_nxge_mmac_stats_t mmac_stats; int i; @@ -4040,9 +3961,9 @@ nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, boolean_t factory) /* * nxge_altmac_set() -- Set an alternate MAC address */ -int -nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, - uint8_t rdctbl) +static int +nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, int slot, + int rdctbl, boolean_t usetbl) { uint8_t addrn; uint8_t portn; @@ -4050,6 +3971,7 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, hostinfo_t mac_rdc; p_nxge_class_pt_cfg_t clscfgp; + altmac.w2 = ((uint16_t)maddr[0] << 8) | ((uint16_t)maddr[1] & 0x0ff); altmac.w1 = ((uint16_t)maddr[2] << 8) | ((uint16_t)maddr[3] & 0x0ff); altmac.w0 = ((uint16_t)maddr[4] << 8) | ((uint16_t)maddr[5] & 0x0ff); @@ -4057,8 +3979,8 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, portn = nxgep->mac.portnum; addrn = (uint8_t)slot - 1; - if (npi_mac_altaddr_entry(nxgep->npi_handle, OP_SET, portn, - addrn, &altmac) != NPI_SUCCESS) + if (npi_mac_altaddr_entry(nxgep->npi_handle, OP_SET, + nxgep->function_num, addrn, &altmac) != NPI_SUCCESS) return (EIO); /* @@ -4067,8 +3989,11 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, */ clscfgp = (p_nxge_class_pt_cfg_t)&nxgep->class_config; mac_rdc.value = 0; - clscfgp->mac_host_info[addrn].rdctbl = rdctbl; - mac_rdc.bits.w0.rdc_tbl_num = rdctbl; + if (usetbl) + mac_rdc.bits.w0.rdc_tbl_num = rdctbl; + else + mac_rdc.bits.w0.rdc_tbl_num = + clscfgp->mac_host_info[addrn].rdctbl; mac_rdc.bits.w0.mac_pref = clscfgp->mac_host_info[addrn].mpr_npr; if (npi_mac_hostinfo_entry(nxgep->npi_handle, OP_SET, @@ -4088,22 +4013,25 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, else addrn = (uint8_t)slot; - if (npi_mac_altaddr_enable(nxgep->npi_handle, portn, addrn) - != NPI_SUCCESS) + if (npi_mac_altaddr_enable(nxgep->npi_handle, + nxgep->function_num, addrn) != NPI_SUCCESS) { return (EIO); + } + return (0); } /* - * nxeg_m_mmac_add() - find an unused address slot, set the address + * nxeg_m_mmac_add_g() - find an unused address slot, set the address * value to the one specified, enable the port to start filtering on * the new MAC address. Returns 0 on success. */ int -nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) +nxge_m_mmac_add_g(void *arg, const uint8_t *maddr, int rdctbl, + boolean_t usetbl) { p_nxge_t nxgep = arg; - mac_addr_slot_t slot; + int slot; nxge_mmac_t *mmac_info; int err; nxge_status_t status; @@ -4127,16 +4055,10 @@ nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) mutex_exit(nxgep->genlock); return (ENOSPC); } - if (!mac_unicst_verify(nxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) { - mutex_exit(nxgep->genlock); - return (EINVAL); - } + /* * Search for the first available slot. Because naddrfree * is not zero, we are guaranteed to find one. - * Slot 0 is for unique (primary) MAC. 
The first alternate - * MAC slot is slot 1. * Each of the first two ports of Neptune has 16 alternate * MAC slots but only the first 7 (of 15) slots have assigned factory * MAC addresses. We first search among the slots without bundled @@ -4146,131 +4068,26 @@ nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) * But the slot could be used by factory MAC again after calling * nxge_m_mmac_remove and nxge_m_mmac_reserve. */ - if (mmac_info->num_factory_mmac < mmac_info->num_mmac) { - for (slot = mmac_info->num_factory_mmac + 1; - slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - if (slot > mmac_info->num_mmac) { - for (slot = 1; slot <= mmac_info->num_factory_mmac; - slot++) { - if (!(mmac_info->mac_pool[slot].flags - & MMAC_SLOT_USED)) - break; - } - } - } else { - for (slot = 1; slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } + for (slot = 0; slot <= mmac_info->num_mmac; slot++) { + if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) + break; } + ASSERT(slot <= mmac_info->num_mmac); - /* - * def_mac_rxdma_grpid is the default rdc table for the port. - */ - if ((err = nxge_altmac_set(nxgep, maddr->mma_addr, slot, - nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) != 0) { + if ((err = nxge_altmac_set(nxgep, (uint8_t *)maddr, slot, rdctbl, + usetbl)) != 0) { mutex_exit(nxgep->genlock); return (err); } - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, ETHERADDRL); + bcopy(maddr, mmac_info->mac_pool[slot].addr, ETHERADDRL); mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED; mmac_info->mac_pool[slot].flags &= ~MMAC_VENDOR_ADDR; mmac_info->naddrfree--; nxge_mmac_kstat_update(nxgep, slot, B_FALSE); - maddr->mma_slot = slot; - - mutex_exit(nxgep->genlock); - return (0); -} - -/* - * This function reserves an unused slot and programs the slot and the HW - * with a factory mac address. - */ -static int -nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr) -{ - p_nxge_t nxgep = arg; - mac_addr_slot_t slot; - nxge_mmac_t *mmac_info; - int err; - nxge_status_t status; - - mutex_enter(nxgep->genlock); - - /* - * Make sure that nxge is initialized, if _start() has - * not been called. 
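The old two-pass search (user slots first, then factory-capable slots) collapses above into a single first-free scan over the pool. Its core, standalone (the flag layout is a stand-in):

    #include <stdint.h>

    #define SLOT_USED       0x1

    /* First slot whose flags lack SLOT_USED, or -1 if none. */
    static int
    first_free_slot(const uint8_t *flags, int nslots)
    {
            int slot;

            for (slot = 0; slot < nslots; slot++) {
                    if (!(flags[slot] & SLOT_USED))
                            return (slot);
            }
            return (-1);
    }

The driver never sees the -1 case because naddrfree was checked beforehand, which is why an ASSERT suffices there.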
- */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - status = nxge_init(nxgep); - if (status != NXGE_OK) { - mutex_exit(nxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &nxgep->nxge_mmac_info; - if (mmac_info->naddrfree == 0) { - mutex_exit(nxgep->genlock); - return (ENOSPC); - } - - slot = maddr->mma_slot; - if (slot == -1) { /* -1: Take the first available slot */ - for (slot = 1; slot <= mmac_info->num_factory_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - if (slot > mmac_info->num_factory_mmac) { - mutex_exit(nxgep->genlock); - return (ENOSPC); - } - } - if (slot < 1 || slot > mmac_info->num_factory_mmac) { - /* - * Do not support factory MAC at a slot greater than - * num_factory_mmac even when there are available factory - * MAC addresses because the alternate MACs are bundled with - * slot[1] through slot[num_factory_mmac] - */ - mutex_exit(nxgep->genlock); - return (EINVAL); - } - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - mutex_exit(nxgep->genlock); - return (EBUSY); - } - /* Verify the address to be reserved */ - if (!mac_unicst_verify(nxgep->mach, - mmac_info->factory_mac_pool[slot], ETHERADDRL)) { - mutex_exit(nxgep->genlock); - return (EINVAL); - } - if (err = nxge_altmac_set(nxgep, - mmac_info->factory_mac_pool[slot], slot, - nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) { - mutex_exit(nxgep->genlock); - return (err); - } - bcopy(mmac_info->factory_mac_pool[slot], maddr->mma_addr, ETHERADDRL); - mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED | MMAC_VENDOR_ADDR; - mmac_info->naddrfree--; - - nxge_mmac_kstat_update(nxgep, slot, B_TRUE); mutex_exit(nxgep->genlock); - - /* Pass info back to the caller */ - maddr->mma_slot = slot; - maddr->mma_addrlen = ETHERADDRL; - maddr->mma_flags = MMAC_SLOT_USED | MMAC_VENDOR_ADDR; - return (0); } @@ -4279,7 +4096,7 @@ nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr) * the mac address anymore. */ int -nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot) +nxge_m_mmac_remove(void *arg, int slot) { p_nxge_t nxgep = arg; nxge_mmac_t *mmac_info; @@ -4350,141 +4167,37 @@ nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot) } /* - * Modify a mac address added by nxge_m_mmac_add or nxge_m_mmac_reserve(). - */ -static int -nxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr) -{ - p_nxge_t nxgep = arg; - mac_addr_slot_t slot; - nxge_mmac_t *mmac_info; - int err = 0; - nxge_status_t status; - - if (!mac_unicst_verify(nxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) - return (EINVAL); - - slot = maddr->mma_slot; - - mutex_enter(nxgep->genlock); - - /* - * Make sure that nxge is initialized, if _start() has - * not been called. - */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - status = nxge_init(nxgep); - if (status != NXGE_OK) { - mutex_exit(nxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &nxgep->nxge_mmac_info; - if (slot < 1 || slot > mmac_info->num_mmac) { - mutex_exit(nxgep->genlock); - return (EINVAL); - } - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - if ((err = nxge_altmac_set(nxgep, - maddr->mma_addr, slot, - nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) != 0) { - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, - ETHERADDRL); - /* - * Assume that the MAC passed down from the caller - * is not a factory MAC address (The user should - * call mmac_remove followed by mmac_reserve if - * he wants to use the factory MAC for this slot). 
- */ - mmac_info->mac_pool[slot].flags &= ~MMAC_VENDOR_ADDR; - nxge_mmac_kstat_update(nxgep, slot, B_FALSE); - } - } else { - err = EINVAL; - } - mutex_exit(nxgep->genlock); - return (err); -} - -/* - * nxge_m_mmac_get() - Get the MAC address and other information - * related to the slot. mma_flags should be set to 0 in the call. - * Note: although kstat shows MAC address as zero when a slot is - * not used, Crossbow expects nxge_m_mmac_get to copy factory MAC - * to the caller as long as the slot is not using a user MAC address. - * The following table shows the rules, - * - * USED VENDOR mma_addr - * ------------------------------------------------------------ - * (1) Slot uses a user MAC: yes no user MAC - * (2) Slot uses a factory MAC: yes yes factory MAC - * (3) Slot is not used but is - * factory MAC capable: no yes factory MAC - * (4) Slot is not used and is - * not factory MAC capable: no no 0 - * ------------------------------------------------------------ + * The callback to query all the factory addresses. naddr must be the same as + * the number of factory addresses (returned by MAC_CAPAB_MULTIFACTADDR), and + * mcm_addr is the space allocated for keep all the addresses, whose size is + * naddr * MAXMACADDRLEN. */ -static int -nxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr) +static void +nxge_m_getfactaddr(void *arg, uint_t naddr, uint8_t *addr) { - nxge_t *nxgep = arg; - mac_addr_slot_t slot; - nxge_mmac_t *mmac_info; - nxge_status_t status; - - slot = maddr->mma_slot; + nxge_t *nxgep = arg; + nxge_mmac_t *mmac_info; + int i; mutex_enter(nxgep->genlock); - /* - * Make sure that nxge is initialized, if _start() has - * not been called. - */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - status = nxge_init(nxgep); - if (status != NXGE_OK) { - mutex_exit(nxgep->genlock); - return (ENXIO); - } - } - mmac_info = &nxgep->nxge_mmac_info; + ASSERT(naddr == mmac_info->num_factory_mmac); - if (slot < 1 || slot > mmac_info->num_mmac) { - mutex_exit(nxgep->genlock); - return (EINVAL); + for (i = 0; i < naddr; i++) { + bcopy(mmac_info->factory_mac_pool[i + 1], + addr + i * MAXMACADDRLEN, ETHERADDRL); } - maddr->mma_flags = 0; - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) - maddr->mma_flags |= MMAC_SLOT_USED; - if (mmac_info->mac_pool[slot].flags & MMAC_VENDOR_ADDR) { - maddr->mma_flags |= MMAC_VENDOR_ADDR; - bcopy(mmac_info->factory_mac_pool[slot], - maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = ETHERADDRL; - } else { - if (maddr->mma_flags & MMAC_SLOT_USED) { - bcopy(mmac_info->mac_pool[slot].addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = ETHERADDRL; - } else { - bzero(maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = 0; - } - } mutex_exit(nxgep->genlock); - return (0); } + static boolean_t nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { nxge_t *nxgep = arg; uint32_t *txflags = cap_data; - multiaddress_capab_t *mmacp = cap_data; switch (cap) { case MAC_CAPAB_HCKSUM: @@ -4495,33 +4208,15 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) } break; - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. 
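nxge_m_getfactaddr() above copies each factory MAC into the caller's buffer at a stride of MAXMACADDRLEN, reading from pool slot i + 1 because slot 0 holds the primary address. The copy, standalone (constants assumed):

    #include <stdint.h>
    #include <string.h>

    #define ETHERADDRL      6

    static void
    copy_factory_addrs(uint8_t *dst, size_t stride,
        const uint8_t pool[][ETHERADDRL], unsigned naddr)
    {
            unsigned i;

            for (i = 0; i < naddr; i++)     /* factory slots begin at 1 */
                    memcpy(dst + i * stride, pool[i + 1], ETHERADDRL);
    }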
- */ - break; + case MAC_CAPAB_MULTIFACTADDR: { + mac_capab_multifactaddr_t *mfacp = cap_data; - case MAC_CAPAB_MULTIADDRESS: - mmacp = (multiaddress_capab_t *)cap_data; mutex_enter(nxgep->genlock); - - mmacp->maddr_naddr = nxgep->nxge_mmac_info.num_mmac; - mmacp->maddr_naddrfree = nxgep->nxge_mmac_info.naddrfree; - mmacp->maddr_flag = 0; /* 0 is required by PSARC2006/265 */ - /* - * maddr_handle is driver's private data, passed back to - * entry point functions as arg. - */ - mmacp->maddr_handle = nxgep; - mmacp->maddr_add = nxge_m_mmac_add; - mmacp->maddr_remove = nxge_m_mmac_remove; - mmacp->maddr_modify = nxge_m_mmac_modify; - mmacp->maddr_get = nxge_m_mmac_get; - mmacp->maddr_reserve = nxge_m_mmac_reserve; - + mfacp->mcm_naddr = nxgep->nxge_mmac_info.num_factory_mmac; + mfacp->mcm_getaddr = nxge_m_getfactaddr; mutex_exit(nxgep->genlock); break; + } case MAC_CAPAB_LSO: { mac_capab_lso_t *cap_lso = cap_data; @@ -4541,39 +4236,49 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) } } -#if defined(sun4v) case MAC_CAPAB_RINGS: { - mac_capab_rings_t *mrings = (mac_capab_rings_t *)cap_data; - - /* - * Only the service domain driver responds to - * this capability request. - */ - if (isLDOMservice(nxgep)) { - mrings->mr_handle = (void *)nxgep; + mac_capab_rings_t *cap_rings = cap_data; + p_nxge_hw_pt_cfg_t p_cfgp = &nxgep->pt_config.hw_config; - /* - * No dynamic allocation of groups and - * rings at this time. Shares dictate the - * configuration. - */ - mrings->mr_gadd_ring = NULL; - mrings->mr_grem_ring = NULL; - mrings->mr_rget = NULL; - mrings->mr_gget = nxge_hio_group_get; - - if (mrings->mr_type == MAC_RING_TYPE_RX) { - mrings->mr_rnum = 8; /* XXX */ - mrings->mr_gnum = 6; /* XXX */ + mutex_enter(nxgep->genlock); + if (cap_rings->mr_type == MAC_RING_TYPE_RX) { + cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC; + cap_rings->mr_rnum = p_cfgp->max_rdcs; + cap_rings->mr_rget = nxge_fill_ring; + cap_rings->mr_gnum = p_cfgp->max_rdc_grpids; + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = nxge_group_add_ring; + cap_rings->mr_gremring = nxge_group_rem_ring; + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_m_getcapab: rx nrings[%d] ngroups[%d]", + p_cfgp->max_rdcs, p_cfgp->max_rdc_grpids)); + } else { + cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC; + cap_rings->mr_rnum = p_cfgp->tdc.count; + cap_rings->mr_rget = nxge_fill_ring; + if (isLDOMservice(nxgep)) { + /* share capable */ + /* Do not report the default ring: hence -1 */ + cap_rings->mr_gnum = + NXGE_MAX_TDC_GROUPS / nxgep->nports - 1; } else { - mrings->mr_rnum = 8; /* XXX */ - mrings->mr_gnum = 0; /* XXX */ + cap_rings->mr_gnum = 0; } - } else - return (B_FALSE); + + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = nxge_group_add_ring; + cap_rings->mr_gremring = nxge_group_rem_ring; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, + "==> nxge_m_getcapab: tx rings # of rings %d", + p_cfgp->tdc.count)); + } + mutex_exit(nxgep->genlock); break; } +#if defined(sun4v) case MAC_CAPAB_SHARES: { mac_capab_share_t *mshares = (mac_capab_share_t *)cap_data; @@ -4581,16 +4286,22 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) * Only the service domain driver responds to * this capability request. 
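In the TX arm of MAC_CAPAB_RINGS above, a share-capable service domain advertises an equal slice of the device's TDC groups per port, minus one; the in-driver comment says "default ring", but what the -1 actually withholds is the default group. Isolated (the function name is illustrative):

    /* e.g. 8 TDC groups on a 2-port device -> 3 exportable groups/port */
    static int
    tx_groups_advertised(int max_tdc_groups, int nports, int share_capable)
    {
            return (share_capable ? max_tdc_groups / nports - 1 : 0);
    }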
*/ + mutex_enter(nxgep->genlock); if (isLDOMservice(nxgep)) { mshares->ms_snum = 3; mshares->ms_handle = (void *)nxgep; mshares->ms_salloc = nxge_hio_share_alloc; mshares->ms_sfree = nxge_hio_share_free; - mshares->ms_sadd = NULL; - mshares->ms_sremove = NULL; + mshares->ms_sadd = nxge_hio_share_add_group; + mshares->ms_sremove = nxge_hio_share_rem_group; mshares->ms_squery = nxge_hio_share_query; - } else + mshares->ms_sbind = nxge_hio_share_bind; + mshares->ms_sunbind = nxge_hio_share_unbind; + mutex_exit(nxgep->genlock); + } else { + mutex_exit(nxgep->genlock); return (B_FALSE); + } break; } #endif @@ -5160,12 +4871,6 @@ nxge_set_priv_prop(p_nxge_t nxgep, const char *pr_name, uint_t pr_valsize, } if (strcmp(pr_name, "_soft_lso_enable") == 0) { - if (nxgep->nxge_mac_state == NXGE_MAC_STARTED) { - NXGE_DEBUG_MSG((nxgep, NXGE_CTL, - "==> nxge_set_priv_prop: name %s (busy)", pr_name)); - err = EBUSY; - return (err); - } if (pr_val == NULL) { NXGE_DEBUG_MSG((nxgep, NXGE_CTL, "==> nxge_set_priv_prop: name %s (null)", pr_name)); @@ -5695,6 +5400,290 @@ _info(struct modinfo *modinfop) } /*ARGSUSED*/ +static int +nxge_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_tx_ring_t ring; + + channel = nxgep->pt_config.hw_config.tdc.start + rhp->index; + ring = nxgep->tx_rings->rings[channel]; + + MUTEX_ENTER(&ring->lock); + ring->tx_ring_handle = rhp->ring_handle; + MUTEX_EXIT(&ring->lock); + + return (0); +} + +static void +nxge_tx_ring_stop(mac_ring_driver_t rdriver) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_tx_ring_t ring; + + channel = nxgep->pt_config.hw_config.tdc.start + rhp->index; + ring = nxgep->tx_rings->rings[channel]; + + MUTEX_ENTER(&ring->lock); + ring->tx_ring_handle = (mac_ring_handle_t)NULL; + MUTEX_EXIT(&ring->lock); +} + +static int +nxge_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_rx_rcr_ring_t ring; + int i; + + channel = nxgep->pt_config.hw_config.start_rdc + rhp->index; + ring = nxgep->rx_rcr_rings->rcr_rings[channel]; + + MUTEX_ENTER(&ring->lock); + + if (nxgep->rx_channel_started[channel] == B_TRUE) { + MUTEX_EXIT(&ring->lock); + return (0); + } + + /* set rcr_ring */ + for (i = 0; i < nxgep->ldgvp->maxldvs; i++) { + if ((nxgep->ldgvp->ldvp[i].is_rxdma == 1) && + (nxgep->ldgvp->ldvp[i].channel == channel)) { + ring->ldvp = &nxgep->ldgvp->ldvp[i]; + ring->ldgp = nxgep->ldgvp->ldvp[i].ldgp; + } + } + + nxgep->rx_channel_started[channel] = B_TRUE; + ring->rcr_mac_handle = rhp->ring_handle; + ring->rcr_gen_num = mr_gen_num; + MUTEX_EXIT(&ring->lock); + + return (0); +} + +static void +nxge_rx_ring_stop(mac_ring_driver_t rdriver) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_rx_rcr_ring_t ring; + + channel = nxgep->pt_config.hw_config.start_rdc + rhp->index; + ring = nxgep->rx_rcr_rings->rcr_rings[channel]; + + MUTEX_ENTER(&ring->lock); + nxgep->rx_channel_started[channel] = B_FALSE; + ring->rcr_mac_handle = NULL; + MUTEX_EXIT(&ring->lock); +} + +/* + * Callback function for the MAC layer to register all rings.
+ */ +static void +nxge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + p_nxge_t nxgep = (p_nxge_t)arg; + p_nxge_hw_pt_cfg_t p_cfgp = &nxgep->pt_config.hw_config; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, + "==> nxge_fill_ring 0x%x index %d", rtype, index)); + + switch (rtype) { + case MAC_RING_TYPE_TX: { + p_nxge_ring_handle_t rhandlep; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, + "==> nxge_fill_ring (TX) 0x%x index %d ntdcs %d", + rtype, index, p_cfgp->tdc.count)); + + ASSERT((index >= 0) && (index < p_cfgp->tdc.count)); + rhandlep = &nxgep->tx_ring_handles[index]; + rhandlep->nxgep = nxgep; + rhandlep->index = index; + rhandlep->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rhandlep; + infop->mri_start = nxge_tx_ring_start; + infop->mri_stop = nxge_tx_ring_stop; + infop->mri_tx = nxge_tx_ring_send; + + break; + } + case MAC_RING_TYPE_RX: { + p_nxge_ring_handle_t rhandlep; + int nxge_rindex; + mac_intr_t nxge_mac_intr; + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_fill_ring (RX) 0x%x index %d nrdcs %d", + rtype, index, p_cfgp->max_rdcs)); + + /* + * 'index' is the ring index within the group. + * Find the ring index in the nxge instance. + */ + nxge_rindex = nxge_get_rxring_index(nxgep, rg_index, index); + + ASSERT((nxge_rindex >= 0) && (nxge_rindex < p_cfgp->max_rdcs)); + rhandlep = &nxgep->rx_ring_handles[nxge_rindex]; + rhandlep->nxgep = nxgep; + rhandlep->index = nxge_rindex; + rhandlep->ring_handle = rh; + + /* + * Entry points to enable the interrupt (disable poll) and + * disable the interrupt (enable poll). + */ + nxge_mac_intr.mi_handle = (mac_intr_handle_t)rhandlep; + nxge_mac_intr.mi_enable = (mac_intr_enable_t)nxge_disable_poll; + nxge_mac_intr.mi_disable = (mac_intr_disable_t)nxge_enable_poll; + infop->mri_driver = (mac_ring_driver_t)rhandlep; + infop->mri_start = nxge_rx_ring_start; + infop->mri_stop = nxge_rx_ring_stop; + infop->mri_intr = nxge_mac_intr; /* ??? */ + infop->mri_poll = nxge_rx_poll; + + break; + } + default: + break; + } + + NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_fill_ring 0x%x", + rtype)); +} + +static void +nxge_group_add_ring(mac_group_driver_t gh, mac_ring_driver_t rh, + mac_ring_type_t type) +{ + nxge_ring_group_t *rgroup = (nxge_ring_group_t *)gh; + nxge_ring_handle_t *rhandle = (nxge_ring_handle_t *)rh; + nxge_t *nxge; + nxge_grp_t *grp; + nxge_rdc_grp_t *rdc_grp; + uint16_t channel; /* device-wise ring id */ + int dev_gindex; + int rv; + + nxge = rgroup->nxgep; + + switch (type) { + case MAC_RING_TYPE_TX: + /* + * nxge_grp_dc_add takes a channel number which is a + * "device" ring ID. + */ + channel = nxge->pt_config.hw_config.tdc.start + rhandle->index; + + /* + * Remove the ring from the default group + */ + if (rgroup->gindex != 0) { + (void) nxge_grp_dc_remove(nxge, VP_BOUND_TX, channel); + } + + /* + * nxge->tx_set.group[] is an array of groups indexed by + * a "port" group ID. + */ + grp = nxge->tx_set.group[rgroup->gindex]; + rv = nxge_grp_dc_add(nxge, grp, VP_BOUND_TX, channel); + if (rv != 0) { + NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, + "nxge_group_add_ring: nxge_grp_dc_add failed")); + } + break; + + case MAC_RING_TYPE_RX: + /* + * nxge->rx_set.group[] is an array of groups indexed by + * a "port" group ID.
+ */ + grp = nxge->rx_set.group[rgroup->gindex]; + + dev_gindex = nxge->pt_config.hw_config.def_mac_rxdma_grpid + + rgroup->gindex; + rdc_grp = &nxge->pt_config.rdc_grps[dev_gindex]; + + /* + * nxge_grp_dc_add takes a channel number which is a + * "device" ring ID. + */ + channel = nxge->pt_config.hw_config.start_rdc + rhandle->index; + rv = nxge_grp_dc_add(nxge, grp, VP_BOUND_RX, channel); + if (rv != 0) { + NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, + "nxge_group_add_ring: nxge_grp_dc_add failed")); + } + + rdc_grp->map |= (1 << channel); + rdc_grp->max_rdcs++; + + (void) nxge_init_fzc_rdc_tbl(nxge, rgroup->rdctbl); + break; + } +} + +static void +nxge_group_rem_ring(mac_group_driver_t gh, mac_ring_driver_t rh, + mac_ring_type_t type) +{ + nxge_ring_group_t *rgroup = (nxge_ring_group_t *)gh; + nxge_ring_handle_t *rhandle = (nxge_ring_handle_t *)rh; + nxge_t *nxge; + uint16_t channel; /* device-wise ring id */ + nxge_rdc_grp_t *rdc_grp; + int dev_gindex; + + nxge = rgroup->nxgep; + + switch (type) { + case MAC_RING_TYPE_TX: + dev_gindex = nxge->pt_config.hw_config.def_mac_txdma_grpid + + rgroup->gindex; + channel = nxge->pt_config.hw_config.tdc.start + rhandle->index; + nxge_grp_dc_remove(nxge, VP_BOUND_TX, channel); + + /* + * Add the ring back to the default group + */ + if (rgroup->gindex != 0) { + nxge_grp_t *grp; + grp = nxge->tx_set.group[0]; + (void) nxge_grp_dc_add(nxge, grp, VP_BOUND_TX, channel); + } + break; + + case MAC_RING_TYPE_RX: + dev_gindex = nxge->pt_config.hw_config.def_mac_rxdma_grpid + + rgroup->gindex; + rdc_grp = &nxge->pt_config.rdc_grps[dev_gindex]; + channel = rdc_grp->start_rdc + rhandle->index; + nxge_grp_dc_remove(nxge, VP_BOUND_RX, channel); + + rdc_grp->map &= ~(1 << channel); + rdc_grp->max_rdcs--; + + (void) nxge_init_fzc_rdc_tbl(nxge, rgroup->rdctbl); + break; + } +} + + +/*ARGSUSED*/ static nxge_status_t nxge_add_intrs(p_nxge_t nxgep) { @@ -5818,33 +5807,6 @@ nxge_add_intrs(p_nxge_t nxgep) return (status); } -/*ARGSUSED*/ -static nxge_status_t -nxge_add_soft_intrs(p_nxge_t nxgep) -{ - - int ddi_status = DDI_SUCCESS; - nxge_status_t status = NXGE_OK; - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_add_soft_intrs")); - - nxgep->resched_id = NULL; - nxgep->resched_running = B_FALSE; - ddi_status = ddi_add_softintr(nxgep->dip, DDI_SOFTINT_LOW, - &nxgep->resched_id, - NULL, NULL, nxge_reschedule, (caddr_t)nxgep); - if (ddi_status != DDI_SUCCESS) { - NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_add_soft_intrs: " - "ddi_add_softintrs failed: status 0x%08x", - ddi_status)); - return (NXGE_ERROR | NXGE_DDI_FAILED); - } - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_ddi_add_soft_intrs")); - - return (status); -} - static nxge_status_t nxge_add_intrs_adv(p_nxge_t nxgep) { @@ -6277,21 +6239,6 @@ nxge_remove_intrs(p_nxge_t nxgep) /*ARGSUSED*/ static void -nxge_remove_soft_intrs(p_nxge_t nxgep) -{ - NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_remove_soft_intrs")); - if (nxgep->resched_id) { - ddi_remove_softintr(nxgep->resched_id); - NXGE_DEBUG_MSG((nxgep, INT_CTL, - "==> nxge_remove_soft_intrs: removed")); - nxgep->resched_id = NULL; - } - - NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_remove_soft_intrs")); -} - -/*ARGSUSED*/ -static void nxge_intrs_enable(p_nxge_t nxgep) { p_nxge_intr_t intrp; @@ -6389,6 +6336,7 @@ nxge_mac_register(p_nxge_t nxgep) macp->m_margin = VLAN_TAGSZ; macp->m_priv_props = nxge_priv_props; macp->m_priv_prop_count = NXGE_MAX_PRIV_PROPS; + macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE; NXGE_DEBUG_MSG((nxgep, MAC_CTL, "==>
nxge_mac_register: instance %d " @@ -6941,7 +6889,7 @@ nxge_niu_peu_reset(p_nxge_t nxgep) static void nxge_set_pci_replay_timeout(p_nxge_t nxgep) { - p_dev_regs_t dev_regs; + p_dev_regs_t dev_regs; uint32_t value; NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_set_pci_replay_timeout")); diff --git a/usr/src/uts/common/io/nxge/nxge_ndd.c b/usr/src/uts/common/io/nxge/nxge_ndd.c index 90c8128428..38bf3d5969 100644 --- a/usr/src/uts/common/io/nxge/nxge_ndd.c +++ b/usr/src/uts/common/io/nxge/nxge_ndd.c @@ -980,15 +980,13 @@ nxge_param_get_txdma_info(p_nxge_t nxgep, queue_t *q, p_mblk_t mp, caddr_t cp) mp->b_cont = np; print_len = 0; - ((mblk_t *)np)->b_wptr += print_len; - buf_len -= print_len; print_len = snprintf((char *)((mblk_t *)np)->b_wptr, buf_len, "TDC\t HW TDC\t\n"); ((mblk_t *)np)->b_wptr += print_len; buf_len -= print_len; set = &nxgep->tx_set; - for (tdc = 0; tdc < NXGE_MAX_RDCS; tdc++) { + for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) { if ((1 << tdc) & set->owned.map) { print_len = snprintf((char *)((mblk_t *)np)->b_wptr, buf_len, "%d\n", tdc); diff --git a/usr/src/uts/common/io/nxge/nxge_rxdma.c b/usr/src/uts/common/io/nxge/nxge_rxdma.c index e0e81491c6..8aeb88f7c5 100644 --- a/usr/src/uts/common/io/nxge/nxge_rxdma.c +++ b/usr/src/uts/common/io/nxge/nxge_rxdma.c @@ -39,6 +39,13 @@ (rdc + nxgep->pt_config.hw_config.start_rdc) /* + * XXX: This is a tunable to limit the number of packets each interrupt + * handles. 0 (default) means that each interrupt takes as many packets + * as it finds. + */ +extern int nxge_max_intr_pkts; + +/* * Globals: tunable parameters (/etc/system or adb) * */ @@ -115,7 +122,7 @@ nxge_status_t nxge_disable_rxdma_channel(p_nxge_t, uint16_t); static p_rx_msg_t nxge_allocb(size_t, uint32_t, p_nxge_dma_common_t); static void nxge_freeb(p_rx_msg_t); -static void nxge_rx_pkts_vring(p_nxge_t, uint_t, rx_dma_ctl_stat_t); +static mblk_t *nxge_rx_pkts_vring(p_nxge_t, uint_t, rx_dma_ctl_stat_t); static nxge_status_t nxge_rx_err_evnts(p_nxge_t, int, rx_dma_ctl_stat_t); static nxge_status_t nxge_rxdma_handle_port_errors(p_nxge_t, @@ -137,8 +144,10 @@ nxge_status_t nxge_init_rxdma_channels(p_nxge_t nxgep) { nxge_grp_set_t *set = &nxgep->rx_set; - int i, count, rdc, channel; + int i, count, channel; nxge_grp_t *group; + dc_map_t map; + int dev_gindex; NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "==> nxge_init_rxdma_channels")); @@ -158,9 +167,11 @@ nxge_init_rxdma_channels(p_nxge_t nxgep) for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - + dev_gindex = + nxgep->pt_config.hw_config.def_mac_rxdma_grpid + i; + map = nxgep->pt_config.rdc_grps[dev_gindex].map; for (channel = 0; channel < NXGE_MAX_RDCS; channel++) { - if ((1 << channel) & group->map) { + if ((1 << channel) & map) { if ((nxge_grp_dc_add(nxgep, group, VP_BOUND_RX, channel))) goto init_rxdma_channels_exit; @@ -178,15 +189,16 @@ init_rxdma_channels_exit: for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - - for (rdc = 0; rdc < NXGE_MAX_RDCS; rdc++) { - if ((1 << rdc) & group->map) { + dev_gindex = + nxgep->pt_config.hw_config.def_mac_rxdma_grpid + i; + map = nxgep->pt_config.rdc_grps[dev_gindex].map; + for (channel = 0; channel < NXGE_MAX_RDCS; channel++) { + if ((1 << channel) & map) { nxge_grp_dc_remove(nxgep, - VP_BOUND_RX, rdc); + VP_BOUND_RX, channel); } } } - if (++count == set->lg.count) break; } @@ -1175,35 +1187,6 @@ nxge_rxdma_regs_dump(p_nxge_t nxgep, int rdc) "<== nxge_rxdma_regs_dump: rdc rdc %d",
rdc)); } -void -nxge_rxdma_stop(p_nxge_t nxgep) -{ - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rxdma_stop")); - - (void) nxge_link_monitor(nxgep, LINK_MONITOR_STOP); - (void) nxge_rx_mac_disable(nxgep); - (void) nxge_rxdma_hw_mode(nxgep, NXGE_DMA_STOP); - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_stop")); -} - -void -nxge_rxdma_stop_reinit(p_nxge_t nxgep) -{ - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rxdma_stop_reinit")); - - (void) nxge_rxdma_stop(nxgep); - (void) nxge_uninit_rxdma_channels(nxgep); - (void) nxge_init_rxdma_channels(nxgep); - -#ifndef AXIS_DEBUG_LB - (void) nxge_xcvr_init(nxgep); - (void) nxge_link_monitor(nxgep, LINK_MONITOR_START); -#endif - (void) nxge_rx_mac_enable(nxgep); - - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_stop_reinit")); -} - nxge_status_t nxge_rxdma_hw_mode(p_nxge_t nxgep, boolean_t enable) { @@ -1438,11 +1421,53 @@ nxge_rxdma_fixup_channel_fail: NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_fixup_channel")); } -/* ARGSUSED */ +/* + * Convert an absolute RDC number to a Receive Buffer Ring index. That is, + * map <channel> to an index into nxgep->rx_rbr_rings. + * (device ring index -> port ring index) + */ int nxge_rxdma_get_ring_index(p_nxge_t nxgep, uint16_t channel) { - return (channel); + int i, ndmas; + uint16_t rdc; + p_rx_rbr_rings_t rx_rbr_rings; + p_rx_rbr_ring_t *rbr_rings; + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_rxdma_get_ring_index: channel %d", channel)); + + rx_rbr_rings = nxgep->rx_rbr_rings; + if (rx_rbr_rings == NULL) { + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "<== nxge_rxdma_get_ring_index: NULL ring pointer")); + return (-1); + } + ndmas = rx_rbr_rings->ndmas; + if (!ndmas) { + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "<== nxge_rxdma_get_ring_index: no channel")); + return (-1); + } + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_rxdma_get_ring_index (ndmas %d)", ndmas)); + + rbr_rings = rx_rbr_rings->rbr_rings; + for (i = 0; i < ndmas; i++) { + rdc = rbr_rings[i]->rdc; + if (channel == rdc) { + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_rxdma_get_rbr_ring: channel %d " + "(index %d) ring %d", channel, i, rbr_rings[i])); + return (i); + } + } + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "<== nxge_rxdma_get_rbr_ring_index: not found")); + + return (-1); } p_rx_rbr_ring_t @@ -1792,11 +1817,12 @@ nxge_rx_intr(void *arg1, void *arg2) uint8_t channel; npi_handle_t handle; rx_dma_ctl_stat_t cs; + p_rx_rcr_ring_t rcr_ring; + mblk_t *mp; #ifdef NXGE_DEBUG rxdma_cfig1_t cfg; #endif - uint_t serviced = DDI_INTR_UNCLAIMED; if (ldvp == NULL) { NXGE_DEBUG_MSG((NULL, INT_CTL, @@ -1826,11 +1852,37 @@ nxge_rx_intr(void *arg1, void *arg2) * receive dma channel. */ handle = NXGE_DEV_NPI_HANDLE(nxgep); + + rcr_ring = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index]; + + /* + * The RCR ring lock must be held when packets + * are being processed and the hardware registers are + * being read or written to prevent race condition + * among the interrupt thread, the polling thread + * (will cause fatal errors such as rcrincon bit set) + * and the setting of the poll_flag. + */ + MUTEX_ENTER(&rcr_ring->lock); + /* * Get the control and status for this channel. 
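The rewritten nxge_rxdma_get_ring_index() above drops the old identity mapping and instead searches the per-port RBR table, so a device-wide channel translates correctly even when the port owns a sparse channel set. The lookup reduced to its core (the array layout is assumed):

    #include <stdint.h>

    /* rdc_of_ring[i] is the device channel backing port ring i. */
    static int
    ring_index(const uint16_t *rdc_of_ring, int ndmas, uint16_t channel)
    {
            int i;

            for (i = 0; i < ndmas; i++) {
                    if (rdc_of_ring[i] == channel)
                            return (i);
            }
            return (-1);
    }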
*/ channel = ldvp->channel; ldgp = ldvp->ldgp; + + if (!isLDOMguest(nxgep)) { + if (!nxgep->rx_channel_started[channel]) { + NXGE_DEBUG_MSG((nxgep, INT_CTL, + "<== nxge_rx_intr: channel is not started")); + MUTEX_EXIT(&rcr_ring->lock); + return (DDI_INTR_CLAIMED); + } + } + + ASSERT(rcr_ring->ldgp == ldgp); + ASSERT(rcr_ring->ldvp == ldvp); + RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, channel, &cs.value); NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_intr:channel %d " @@ -1840,15 +1892,13 @@ nxge_rx_intr(void *arg1, void *arg2) cs.bits.hdw.rcrto, cs.bits.hdw.rcrthres)); - nxge_rx_pkts_vring(nxgep, ldvp->vdma_index, cs); - serviced = DDI_INTR_CLAIMED; + mp = nxge_rx_pkts_vring(nxgep, ldvp->vdma_index, cs); /* error events. */ if (cs.value & RX_DMA_CTL_STAT_ERROR) { (void) nxge_rx_err_evnts(nxgep, channel, cs); } -nxge_intr_exit: /* * Enable the mailbox update interrupt if we want * to use mailbox. We probably don't need to use @@ -1856,40 +1906,82 @@ nxge_intr_exit: * Also write 1 to rcrthres and rcrto to clear * these two edge triggered bits. */ - cs.value &= RX_DMA_CTL_STAT_WR1C; - cs.bits.hdw.mex = 1; + cs.bits.hdw.mex = rcr_ring->poll_flag ? 0 : 1; RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel, cs.value); /* - * Rearm this logical group if this is a single device - * group. + * If the polling mode is enabled, disable the interrupt. */ - if (ldgp->nldvs == 1) { - ldgimgm_t mgm; - mgm.value = 0; - mgm.bits.ldw.arm = 1; - mgm.bits.ldw.timer = ldgp->ldg_timer; - if (isLDOMguest(nxgep)) { - nxge_hio_ldgimgn(nxgep, ldgp); - } else { + if (rcr_ring->poll_flag) { + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_intr: rdc %d ldgp $%p ldvp $%p " + "(disabling interrupts)", channel, ldgp, ldvp)); + /* + * Disarm this logical group if this is a single device + * group. + */ + if (ldgp->nldvs == 1) { + ldgimgm_t mgm; + mgm.value = 0; + mgm.bits.ldw.arm = 0; NXGE_REG_WR64(handle, - LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), - mgm.value); + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value); + } + } else { + /* + * Rearm this logical group if this is a single device group. + */ + if (ldgp->nldvs == 1) { + if (isLDOMguest(nxgep)) { + nxge_hio_ldgimgn(nxgep, ldgp); + } else { + ldgimgm_t mgm; + + mgm.value = 0; + mgm.bits.ldw.arm = 1; + mgm.bits.ldw.timer = ldgp->ldg_timer; + + NXGE_REG_WR64(handle, + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), + mgm.value); + } } + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_intr: rdc %d ldgp $%p " + "exiting ISR (and call mac_rx_ring)", channel, ldgp)); } + MUTEX_EXIT(&rcr_ring->lock); - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: serviced %d", - serviced)); - return (serviced); + if (mp) { + if (!isLDOMguest(nxgep)) + mac_rx_ring(nxgep->mach, rcr_ring->rcr_mac_handle, mp, + rcr_ring->rcr_gen_num); +#if defined(sun4v) + else { /* isLDOMguest(nxgep) */ + nxge_hio_data_t *nhd = (nxge_hio_data_t *) + nxgep->nxge_hw_p->hio; + nx_vio_fp_t *vio = &nhd->hio.vio; + + if (vio->cb.vio_net_rx_cb) { + (*vio->cb.vio_net_rx_cb) + (nxgep->hio_vr->vhp, mp); + } + } +#endif + } + NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: DDI_INTR_CLAIMED")); + return (DDI_INTR_CLAIMED); } /* * Process the packets received in the specified logical device * and pass up a chain of message blocks to the upper layer. + * The RCR ring lock must be held before calling this function. 
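The exit path of nxge_rx_intr() above makes two decisions from poll_flag, both under the RCR ring lock: whether to request another mailbox update (the mex bit) and what to do with a single-device logical group. As a decision table in code (a restatement, not driver code; multi-device groups are left untouched either way):

    #include <stdbool.h>

    enum ldg_action { LDG_LEAVE, LDG_DISARM, LDG_ARM };

    struct isr_exit {
            bool mex;               /* request another mailbox update */
            enum ldg_action action; /* disposition of the logical group */
    };

    static struct isr_exit
    isr_exit_policy(bool poll_flag, int nldvs)
    {
            struct isr_exit e;

            e.mex = !poll_flag;
            if (nldvs != 1)
                    e.action = LDG_LEAVE;
            else
                    e.action = poll_flag ? LDG_DISARM : LDG_ARM;
            return (e);
    }

While poll_flag is set the group stays disarmed, so packets move only through the mri_poll entry point until nxge_disable_poll() rearms it.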
*/ -static void +static mblk_t * nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) { p_mblk_t mp; @@ -1897,15 +1989,14 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts_vring")); rcrp = nxgep->rx_rcr_rings->rcr_rings[vindex]; - if (rcrp->poll_flag) { - /* It is in the poll mode */ - return; - } + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_pkts_vring: (calling nxge_rx_pkts)rdc %d " + "rcr_mac_handle $%p ", rcrp->rdc, rcrp->rcr_mac_handle)); if ((mp = nxge_rx_pkts(nxgep, rcrp, cs, -1)) == NULL) { NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_pkts_vring: no mp")); - return; + return (NULL); } NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts_vring: $%p", @@ -1947,21 +2038,11 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) mp->b_next->b_wptr - mp->b_next->b_rptr))); } #endif + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "<== nxge_rx_pkts_vring: returning rdc %d rcr_mac_handle $%p ", + rcrp->rdc, rcrp->rcr_mac_handle)); - if (!isLDOMguest(nxgep)) - mac_rx(nxgep->mach, rcrp->rcr_mac_handle, mp); -#if defined(sun4v) - else { /* isLDOMguest(nxgep) */ - nxge_hio_data_t *nhd = (nxge_hio_data_t *) - nxgep->nxge_hw_p->hio; - nx_vio_fp_t *vio = &nhd->hio.vio; - - if (vio->cb.vio_net_rx_cb) { - (*vio->cb.vio_net_rx_cb) - (nxgep->hio_vr->vhp, mp); - } - } -#endif + return (mp); } @@ -1978,6 +2059,7 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) * a hardware control status register will be updated with the number of * packets were removed from the hardware queue. * + * The RCR ring lock is held when entering this function. */ static mblk_t * nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, @@ -1998,7 +2080,7 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, npi_status_t rs = NPI_SUCCESS; #endif - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts: " + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "==> nxge_rx_pkts: " "channel %d", rcr_p->rdc)); if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { @@ -2032,7 +2114,7 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, if (!qlen) { - NXGE_DEBUG_MSG((nxgep, RX_CTL, + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "==> nxge_rx_pkts:rcr channel %d " "qlen %d (no pkts)", channel, qlen)); @@ -2140,6 +2222,13 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, (totallen >= bytes_to_pickup)) { break; } + + /* limit the number of packets for interrupt */ + if (!(rcr_p->poll_flag)) { + if (npkt_read == nxge_max_intr_pkts) { + break; + } + } } rcr_p->rcr_desc_rd_head_pp = rcr_desc_rd_head_pp; @@ -2174,7 +2263,9 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, * read. */ - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_pkts")); + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_rx_pkts: return" + "channel %d", rcr_p->rdc)); + return (head_mp); } @@ -2280,7 +2371,7 @@ nxge_receive_packet(p_nxge_t nxgep, } /* - * Sofware workaround for BMAC hardware limitation that allows + * Software workaround for BMAC hardware limitation that allows * maxframe size of 1526, instead of 1522 for non-jumbo and 0x2406 * instead of 0x2400 for jumbo. 
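 * For reference (standard Ethernet sizing is assumed here): 1522 is
 * ETHERMAX (1518) plus a 4-byte VLAN tag, and 0x2400 is the 9216-byte
 * jumbo limit, but the smallest limits the BMAC accepts are 1526 and
 * 0x2406. Frames that land between the true maximum and the BMAC
 * maximum must therefore be caught by this software check.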
*/ @@ -2318,7 +2409,6 @@ nxge_receive_packet(p_nxge_t nxgep, hdr_size)); } - MUTEX_ENTER(&rcr_p->lock); MUTEX_ENTER(&rx_rbr_p->lock); NXGE_DEBUG_MSG((nxgep, RX_CTL, @@ -2344,7 +2434,6 @@ nxge_receive_packet(p_nxge_t nxgep, if (status != NXGE_OK) { MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_receive_packet: found vaddr failed %d", status)); @@ -2392,7 +2481,6 @@ nxge_receive_packet(p_nxge_t nxgep, break; default: MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); return; } @@ -2558,7 +2646,6 @@ nxge_receive_packet(p_nxge_t nxgep, } MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); nxge_freeb(rx_msg_p); return; } @@ -2643,7 +2730,6 @@ nxge_receive_packet(p_nxge_t nxgep, rx_msg_p->free = B_TRUE; } MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); nxge_freeb(rx_msg_p); return; } @@ -2657,7 +2743,6 @@ nxge_receive_packet(p_nxge_t nxgep, rcr_p->rcvd_pkt_bytes = bytes_read; MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); if (rx_msg_p->free && rx_msg_p->rx_use_bcopy) { atomic_inc_32(&rx_msg_p->ref_cnt); @@ -2682,8 +2767,6 @@ nxge_receive_packet(p_nxge_t nxgep, if (is_valid && !multi) { /* - * Update hardware checksuming. - * * If the checksum flag nxge_chksum_offload * is 1, TCP and UDP packets can be sent * up with good checksum. If the checksum flag @@ -2727,6 +2810,177 @@ nxge_receive_packet(p_nxge_t nxgep, *multi_p, nmp, *mp, *mp_cont)); } +/* + * Enable polling for a ring. The ring's interrupt is disabled when + * the next nxge interrupt arrives (see nxge_rx_intr()). + */ +int +nxge_enable_poll(void *arg) +{ + p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg; + p_rx_rcr_ring_t ringp; + p_nxge_t nxgep; + p_nxge_ldg_t ldgp; + uint32_t channel; + + if (ring_handle == NULL) { + return (0); + } + + nxgep = ring_handle->nxgep; + channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index; + ringp = nxgep->rx_rcr_rings->rcr_rings[channel]; + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_enable_poll: rdc %d ", ringp->rdc)); + ldgp = ringp->ldgp; + if (ldgp == NULL) { + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_enable_poll: rdc %d NULL ldgp: no change", + ringp->rdc)); + return (0); + } + + MUTEX_ENTER(&ringp->lock); + /* enable polling */ + if (ringp->poll_flag == 0) { + ringp->poll_flag = 1; + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_enable_poll: rdc %d set poll flag to 1", + ringp->rdc)); + } + + MUTEX_EXIT(&ringp->lock); + return (0); +} +/* + * Disable polling for a ring and enable its interrupt. + */ +int +nxge_disable_poll(void *arg) +{ + p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg; + p_rx_rcr_ring_t ringp; + p_nxge_t nxgep; + uint32_t channel; + + if (ring_handle == NULL) { + return (0); + } + + nxgep = ring_handle->nxgep; + channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index; + ringp = nxgep->rx_rcr_rings->rcr_rings[channel]; + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_disable_poll: rdc %d poll_flag %d", ringp->rdc, + ringp->poll_flag)); + + MUTEX_ENTER(&ringp->lock); + + /* disable polling: enable interrupt */ + if (ringp->poll_flag) { + npi_handle_t handle; + rx_dma_ctl_stat_t cs; + uint8_t channel; + p_nxge_ldg_t ldgp; + + /* + * Get the control and status for this channel.
+ */ + handle = NXGE_DEV_NPI_HANDLE(nxgep); + channel = ringp->rdc; + RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, + channel, &cs.value); + + /* + * Enable mailbox update. + * Since packets were not read and the hardware uses + * bits pktread and ptrread to update the queue + * length, we need to set both bits to 0. + */ + cs.bits.ldw.pktread = 0; + cs.bits.ldw.ptrread = 0; + cs.bits.hdw.mex = 1; + RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel, + cs.value); + + /* + * Rearm this logical group if this is a single device + * group. + */ + ldgp = ringp->ldgp; + if (ldgp == NULL) { + ringp->poll_flag = 0; + MUTEX_EXIT(&ringp->lock); + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_disable_poll: no ldgp rdc %d " + "(still set poll to 0)", ringp->rdc)); + return (0); + } + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_disable_poll: rdc %d ldgp $%p (enable intr)", + ringp->rdc, ldgp)); + if (ldgp->nldvs == 1) { + ldgimgm_t mgm; + mgm.value = 0; + mgm.bits.ldw.arm = 1; + mgm.bits.ldw.timer = ldgp->ldg_timer; + NXGE_REG_WR64(handle, + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value); + } + ringp->poll_flag = 0; + } + + MUTEX_EXIT(&ringp->lock); + return (0); +} + +/* + * Poll 'bytes_to_pickup' bytes of messages from the rx ring. + */ +mblk_t * +nxge_rx_poll(void *arg, int bytes_to_pickup) +{ + p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg; + p_rx_rcr_ring_t rcr_p; + p_nxge_t nxgep; + npi_handle_t handle; + rx_dma_ctl_stat_t cs; + mblk_t *mblk; + p_nxge_ldv_t ldvp; + uint32_t channel; + + nxgep = ring_handle->nxgep; + + /* + * Get the control and status for this channel. + */ + handle = NXGE_DEV_NPI_HANDLE(nxgep); + channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index; + rcr_p = nxgep->rx_rcr_rings->rcr_rings[channel]; + MUTEX_ENTER(&rcr_p->lock); + ASSERT(rcr_p->poll_flag == 1); + + RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, rcr_p->rdc, &cs.value); + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_poll: calling nxge_rx_pkts: rdc %d poll_flag %d", + rcr_p->rdc, rcr_p->poll_flag)); + mblk = nxge_rx_pkts(nxgep, rcr_p, cs, bytes_to_pickup); + + ldvp = rcr_p->ldvp; + /* error events. */ + if (ldvp && (cs.value & RX_DMA_CTL_STAT_ERROR)) { + (void) nxge_rx_err_evnts(nxgep, ldvp->vdma_index, cs); + } + + MUTEX_EXIT(&rcr_p->lock); + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "<== nxge_rx_poll: rdc %d mblk $%p", rcr_p->rdc, mblk)); + return (mblk); +} + + /*ARGSUSED*/ static nxge_status_t nxge_rx_err_evnts(p_nxge_t nxgep, int channel, rx_dma_ctl_stat_t cs) @@ -4231,6 +4485,7 @@ nxge_rxdma_stop_channel(p_nxge_t nxgep, uint16_t channel) * Make sure channel is disabled.
*/ status = nxge_disable_rxdma_channel(nxgep, channel); + if (status != NXGE_OK) { NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, " nxge_rxdma_stop_channel: " diff --git a/usr/src/uts/common/io/nxge/nxge_send.c b/usr/src/uts/common/io/nxge/nxge_send.c index 7e656c9072..2b21d22a1c 100644 --- a/usr/src/uts/common/io/nxge/nxge_send.c +++ b/usr/src/uts/common/io/nxge/nxge_send.c @@ -40,8 +40,6 @@ static void nxge_hcksum_retrieve(mblk_t *, uint32_t *, uint32_t *); static uint32_t nxge_csgen(uint16_t *, int); -extern void nxge_txdma_freemsg_task(p_tx_ring_t ringp); - extern uint32_t nxge_reclaim_pending; extern uint32_t nxge_bcopy_thresh; extern uint32_t nxge_dvma_thresh; @@ -51,18 +49,116 @@ extern uint32_t nxge_tx_intr_thres; extern uint32_t nxge_tx_max_gathers; extern uint32_t nxge_tx_tiny_pack; extern uint32_t nxge_tx_use_bcopy; -extern uint32_t nxge_tx_lb_policy; -extern uint32_t nxge_no_tx_lb; extern nxge_tx_mode_t nxge_tx_scheme; uint32_t nxge_lso_kick_cnt = 2; -typedef struct _mac_tx_hint { - uint16_t sap; - uint16_t vid; - void *hash; -} mac_tx_hint_t, *p_mac_tx_hint_t; -int nxge_tx_lb_ring_1(p_mblk_t, uint32_t, p_mac_tx_hint_t); +void +nxge_tx_ring_task(void *arg) +{ + p_tx_ring_t ring = (p_tx_ring_t)arg; + + MUTEX_ENTER(&ring->lock); + (void) nxge_txdma_reclaim(ring->nxgep, ring, 0); + MUTEX_EXIT(&ring->lock); + + if (!isLDOMguest(ring->nxgep) && !ring->tx_ring_offline) + mac_tx_ring_update(ring->nxgep->mach, ring->tx_ring_handle); +#if defined(sun4v) + else { + nxge_hio_data_t *nhd = + (nxge_hio_data_t *)ring->nxgep->nxge_hw_p->hio; + nx_vio_fp_t *vio = &nhd->hio.vio; + + /* Call back vnet. */ + if (vio->cb.vio_net_tx_update) { + (*vio->cb.vio_net_tx_update)(ring->nxgep->hio_vr->vhp); + } + } +#endif +} + +static void +nxge_tx_ring_dispatch(p_tx_ring_t ring) +{ + /* + * Kick the ring task to reclaim some buffers. + */ + (void) ddi_taskq_dispatch(ring->taskq, + nxge_tx_ring_task, (void *)ring, DDI_SLEEP); +} + +mblk_t * +nxge_tx_ring_send(void *arg, mblk_t *mp) +{ + p_nxge_ring_handle_t nrhp = (p_nxge_ring_handle_t)arg; + p_nxge_t nxgep; + p_tx_ring_t tx_ring_p; + int status, channel; + + ASSERT(nrhp != NULL); + nxgep = nrhp->nxgep; + channel = nxgep->pt_config.hw_config.tdc.start + nrhp->index; + tx_ring_p = nxgep->tx_rings->rings[channel]; + + ASSERT(nxgep == tx_ring_p->nxgep); + +#ifdef DEBUG + if (isLDOMservice(nxgep)) { + ASSERT(!tx_ring_p->tx_ring_offline); + } +#endif + + status = nxge_start(nxgep, tx_ring_p, mp); + if (status) { + nxge_tx_ring_dispatch(tx_ring_p); + return (mp); + } + + return ((mblk_t *)NULL); +} + +#if defined(sun4v) + +/* + * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in + * the guest domain. See CR 6778758 for long term solution. + */ + +mblk_t * +nxge_m_tx(void *arg, mblk_t *mp) +{ + p_nxge_t nxgep = (p_nxge_t)arg; + mblk_t *next; + p_tx_ring_t tx_ring_p; + int status; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx")); + + /* + * Get the default ring handle. 
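+ * A guest domain is assigned a single ring group, so every chain is
+ * sent on ring 0 here. Note the b_next handling below: on a send
+ * failure the unsent remainder of the chain is re-linked and returned
+ * to the caller, which signals the framework to hold the chain and
+ * retry later.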
+ */ + tx_ring_p = nxgep->tx_rings->rings[0]; + + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + + status = nxge_start(nxgep, tx_ring_p, mp); + if (status != 0) { + mp->b_next = next; + nxge_tx_ring_dispatch(tx_ring_p); + return (mp); + } + + mp = next; + } + + NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_m_tx")); + return ((mblk_t *)NULL); +} + +#endif int nxge_start(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, p_mblk_t mp) @@ -305,8 +401,6 @@ start_again: tx_ring_p->tdc)); goto nxge_start_fail_lso; } else { - boolean_t skip_sched = B_FALSE; - cas32((uint32_t *)&tx_ring_p->queueing, 0, 1); tdc_stats->tx_no_desc++; @@ -316,16 +410,10 @@ start_again: (void) atomic_swap_32( &tx_ring_p->tx_ring_offline, NXGE_TX_RING_OFFLINED); - skip_sched = B_TRUE; } } MUTEX_EXIT(&tx_ring_p->lock); - if (nxgep->resched_needed && - !nxgep->resched_running && !skip_sched) { - nxgep->resched_running = B_TRUE; - ddi_trigger_softintr(nxgep->resched_id); - } status = 1; goto nxge_start_fail1; } @@ -1012,10 +1100,7 @@ nxge_start_control_header_only: MUTEX_EXIT(&tx_ring_p->lock); - nxge_txdma_freemsg_task(tx_ring_p); - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start")); - return (status); nxge_start_fail_lso: @@ -1105,8 +1190,6 @@ nxge_start_fail2: tx_ring_p->tx_wrap_mask); } - - nxgep->resched_needed = B_TRUE; } if (isLDOMservice(nxgep)) { @@ -1123,300 +1206,9 @@ nxge_start_fail1: /* Add FMA to check the access handle nxge_hregh */ NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start")); - - return (status); -} - -int -nxge_serial_tx(mblk_t *mp, void *arg) -{ - p_tx_ring_t tx_ring_p = (p_tx_ring_t)arg; - p_nxge_t nxgep = tx_ring_p->nxgep; - int status = 0; - - if (isLDOMservice(nxgep)) { - if (tx_ring_p->tx_ring_offline) { - freemsg(mp); - return (status); - } - } - - status = nxge_start(nxgep, tx_ring_p, mp); return (status); } -boolean_t -nxge_send(p_nxge_t nxgep, mblk_t *mp, p_mac_tx_hint_t hp) -{ - p_tx_ring_t *tx_rings; - uint8_t ring_index; - p_tx_ring_t tx_ring_p; - nxge_grp_t *group; - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_send")); - - ASSERT(mp->b_next == NULL); - - group = nxgep->tx_set.group[0]; /* The default group */ - ring_index = nxge_tx_lb_ring_1(mp, group->count, hp); - - tx_rings = nxgep->tx_rings->rings; - tx_ring_p = tx_rings[group->legend[ring_index]]; - - if (isLDOMservice(nxgep)) { - if (tx_ring_p->tx_ring_offline) { - /* - * OFFLINE means that it is in the process of being - * shared - that is, it has been claimed by the HIO - * code, but hasn't been unlinked from <group> yet. - * So in this case use the first TDC, which always - * belongs to the service domain and can't be shared. 
- */ - ring_index = 0; - tx_ring_p = tx_rings[group->legend[ring_index]]; - } - } - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "count %d, tx_rings[%d] = %p", - (int)group->count, group->legend[ring_index], tx_ring_p)); - - switch (nxge_tx_scheme) { - case NXGE_USE_START: - if (nxge_start(nxgep, tx_ring_p, mp)) { - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: failed " - "ring index %d", ring_index)); - return (B_FALSE); - } - break; - - case NXGE_USE_SERIAL: - default: - nxge_serialize_enter(tx_ring_p->serial, mp); - break; - } - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: ring index %d", - ring_index)); - - return (B_TRUE); -} - -/* - * nxge_m_tx() - send a chain of packets - */ -mblk_t * -nxge_m_tx(void *arg, mblk_t *mp) -{ - p_nxge_t nxgep = (p_nxge_t)arg; - mblk_t *next; - mac_tx_hint_t hint; - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_m_tx")); - - if ((!(nxgep->drv_state & STATE_HW_INITIALIZED)) || - (nxgep->nxge_mac_state != NXGE_MAC_STARTED)) { - NXGE_DEBUG_MSG((nxgep, DDI_CTL, - "==> nxge_m_tx: hardware not initialized")); - NXGE_DEBUG_MSG((nxgep, DDI_CTL, - "<== nxge_m_tx")); - freemsgchain(mp); - mp = NULL; - return (mp); - } - - hint.hash = NULL; - hint.vid = 0; - hint.sap = 0; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - /* - * Until Nemo tx resource works, the mac driver - * does the load balancing based on TCP port, - * or CPU. For debugging, we use a system - * configurable parameter. - */ - if (!nxge_send(nxgep, mp, &hint)) { - mp->b_next = next; - break; - } - - mp = next; - - NXGE_DEBUG_MSG((NULL, TX_CTL, - "==> nxge_m_tx: (go back to loop) mp $%p next $%p", - mp, next)); - } - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_m_tx")); - return (mp); -} - -int -nxge_tx_lb_ring_1(p_mblk_t mp, uint32_t maxtdcs, p_mac_tx_hint_t hp) -{ - uint8_t ring_index = 0; - uint8_t *tcp_port; - p_mblk_t nmp; - size_t mblk_len; - size_t iph_len; - size_t hdrs_size; - uint8_t hdrs_buf[sizeof (struct ether_vlan_header) + - IP_MAX_HDR_LENGTH + sizeof (uint32_t)]; - /* - * allocate space big enough to cover - * the max ip header length and the first - * 4 bytes of the TCP/IP header. 
- */ - - boolean_t qos = B_FALSE; - ushort_t eth_type; - size_t eth_hdr_size; - - NXGE_DEBUG_MSG((NULL, TX_CTL, "==> nxge_tx_lb_ring")); - - if (hp->vid) { - qos = B_TRUE; - } - switch (nxge_tx_lb_policy) { - case NXGE_TX_LB_TCPUDP: /* default IPv4 TCP/UDP */ - default: - tcp_port = mp->b_rptr; - eth_type = ntohs(((struct ether_header *)tcp_port)->ether_type); - if (eth_type == VLAN_ETHERTYPE) { - eth_type = ntohs(((struct ether_vlan_header *) - tcp_port)->ether_type); - eth_hdr_size = sizeof (struct ether_vlan_header); - } else { - eth_hdr_size = sizeof (struct ether_header); - } - - if (!nxge_no_tx_lb && !qos && eth_type == ETHERTYPE_IP) { - nmp = mp; - mblk_len = MBLKL(nmp); - tcp_port = NULL; - if (mblk_len > eth_hdr_size + sizeof (uint8_t)) { - tcp_port = nmp->b_rptr + eth_hdr_size; - mblk_len -= eth_hdr_size; - iph_len = ((*tcp_port) & 0x0f) << 2; - if (mblk_len > (iph_len + sizeof (uint32_t))) { - tcp_port = nmp->b_rptr; - } else { - tcp_port = NULL; - } - } - if (tcp_port == NULL) { - hdrs_size = 0; - while ((nmp) && (hdrs_size < - sizeof (hdrs_buf))) { - mblk_len = MBLKL(nmp); - if (mblk_len >= - (sizeof (hdrs_buf) - hdrs_size)) - mblk_len = sizeof (hdrs_buf) - - hdrs_size; - bcopy(nmp->b_rptr, - &hdrs_buf[hdrs_size], mblk_len); - hdrs_size += mblk_len; - nmp = nmp->b_cont; - } - tcp_port = hdrs_buf; - } - tcp_port += eth_hdr_size; - if (!(tcp_port[6] & 0x3f) && !(tcp_port[7] & 0xff)) { - switch (tcp_port[9]) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ESP: - tcp_port += ((*tcp_port) & 0x0f) << 2; - ring_index = - ((tcp_port[0] ^ - tcp_port[1] ^ - tcp_port[2] ^ - tcp_port[3]) % maxtdcs); - break; - - case IPPROTO_AH: - /* SPI starts at the 4th byte */ - tcp_port += ((*tcp_port) & 0x0f) << 2; - ring_index = - ((tcp_port[4] ^ - tcp_port[5] ^ - tcp_port[6] ^ - tcp_port[7]) % maxtdcs); - break; - - default: - ring_index = tcp_port[19] % maxtdcs; - break; - } - } else { /* fragmented packet */ - ring_index = tcp_port[19] % maxtdcs; - } - } else { - ring_index = mp->b_band % maxtdcs; - } - break; - - case NXGE_TX_LB_HASH: - if (hp->hash) { -#if defined(__i386) - ring_index = ((uint32_t)(hp->hash) % maxtdcs); -#else - ring_index = ((uint64_t)(hp->hash) % maxtdcs); -#endif - } else { - ring_index = mp->b_band % maxtdcs; - } - break; - - case NXGE_TX_LB_DEST_MAC: /* Use destination MAC address */ - tcp_port = mp->b_rptr; - ring_index = tcp_port[5] % maxtdcs; - break; - } - - NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_tx_lb_ring")); - - return (ring_index); -} - -uint_t -nxge_reschedule(caddr_t arg) -{ - p_nxge_t nxgep; - - nxgep = (p_nxge_t)arg; - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_reschedule")); - - if (nxgep->nxge_mac_state == NXGE_MAC_STARTED && - nxgep->resched_needed) { - if (!isLDOMguest(nxgep)) - mac_tx_update(nxgep->mach); -#if defined(sun4v) - else { /* isLDOMguest(nxgep) */ - nxge_hio_data_t *nhd = (nxge_hio_data_t *) - nxgep->nxge_hw_p->hio; - nx_vio_fp_t *vio = &nhd->hio.vio; - - /* Call back vnet. 
*/ - if (vio->cb.vio_net_tx_update) { - (*vio->cb.vio_net_tx_update) - (nxgep->hio_vr->vhp); - } - } -#endif - nxgep->resched_needed = B_FALSE; - nxgep->resched_running = B_FALSE; - } - - NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_reschedule")); - return (DDI_INTR_CLAIMED); -} - - /* Software LSO starts here */ static void nxge_hcksum_retrieve(mblk_t *mp, diff --git a/usr/src/uts/common/io/nxge/nxge_txdma.c b/usr/src/uts/common/io/nxge/nxge_txdma.c index 892c7bb65a..766e900da7 100644 --- a/usr/src/uts/common/io/nxge/nxge_txdma.c +++ b/usr/src/uts/common/io/nxge/nxge_txdma.c @@ -31,7 +31,7 @@ #include <sys/llc1.h> uint32_t nxge_reclaim_pending = TXDMA_RECLAIM_PENDING_DEFAULT; -uint32_t nxge_tx_minfree = 32; +uint32_t nxge_tx_minfree = 64; uint32_t nxge_tx_intr_thres = 0; uint32_t nxge_tx_max_gathers = TX_MAX_GATHER_POINTERS; uint32_t nxge_tx_tiny_pack = 1; @@ -53,9 +53,7 @@ extern ddi_device_acc_attr_t nxge_dev_buf_dma_acc_attr; extern ddi_dma_attr_t nxge_desc_dma_attr; extern ddi_dma_attr_t nxge_tx_dma_attr; -extern int nxge_serial_tx(mblk_t *mp, void *arg); - -void nxge_txdma_freemsg_task(p_tx_ring_t tx_ring_p); +extern void nxge_tx_ring_task(void *arg); static nxge_status_t nxge_map_txdma(p_nxge_t, int); @@ -97,22 +95,25 @@ nxge_init_txdma_channels(p_nxge_t nxgep) nxge_grp_set_t *set = &nxgep->tx_set; int i, tdc, count; nxge_grp_t *group; + dc_map_t map; + int dev_gindex; NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "==> nxge_init_txdma_channels")); for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - + dev_gindex = + nxgep->pt_config.hw_config.def_mac_txdma_grpid + i; + map = nxgep->pt_config.tdc_grps[dev_gindex].map; for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) { - if ((1 << tdc) & group->map) { - if ((nxge_grp_dc_add(nxgep, group, - VP_BOUND_TX, tdc))) + if ((1 << tdc) & map) { + if ((nxge_grp_dc_add(nxgep, + group, VP_BOUND_TX, tdc))) goto init_txdma_channels_exit; } } } - if (++count == set->lg.count) break; } @@ -124,21 +125,22 @@ init_txdma_channels_exit: for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - + dev_gindex = + nxgep->pt_config.hw_config.def_mac_txdma_grpid + i; + map = nxgep->pt_config.tdc_grps[dev_gindex].map; for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) { - if ((1 << tdc) & group->map) { + if ((1 << tdc) & map) { nxge_grp_dc_remove(nxgep, VP_BOUND_TX, tdc); } } } - if (++count == set->lg.count) break; } - NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "<== nxge_init_txdma_channels")); return (NXGE_ERROR); + } nxge_status_t @@ -890,44 +892,6 @@ nxge_tx_pkt_nmblocks(p_mblk_t mp, int *tot_xfer_len_p) return (nmblks); } -static void -nxge_txdma_freemsg_list_add(p_tx_ring_t tx_ring_p, p_tx_msg_t msgp) -{ - MUTEX_ENTER(&tx_ring_p->freelock); - if (tx_ring_p->tx_free_list_p != NULL) - msgp->nextp = tx_ring_p->tx_free_list_p; - tx_ring_p->tx_free_list_p = msgp; - MUTEX_EXIT(&tx_ring_p->freelock); -} - -/* - * void - * nxge_txdma_freemsg_task() -- walk the list of messages to be - * freed and free the messages. 
- */ -void -nxge_txdma_freemsg_task(p_tx_ring_t tx_ring_p) -{ - p_tx_msg_t msgp, nextp; - - if (tx_ring_p->tx_free_list_p != NULL) { - MUTEX_ENTER(&tx_ring_p->freelock); - msgp = tx_ring_p->tx_free_list_p; - tx_ring_p->tx_free_list_p = (p_tx_msg_t)NULL; - MUTEX_EXIT(&tx_ring_p->freelock); - - while (msgp != NULL) { - nextp = msgp->nextp; - if (msgp->tx_message != NULL) { - freemsg(msgp->tx_message); - msgp->tx_message = NULL; - } - msgp->nextp = NULL; - msgp = nextp; - } - } -} - boolean_t nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks) { @@ -947,7 +911,7 @@ nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks) uint16_t head_index, tail_index; uint8_t tdc; boolean_t head_wrap, tail_wrap; - p_nxge_tx_ring_stats_t tdc_stats; + p_nxge_tx_ring_stats_t tdc_stats; int rc; NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_reclaim")); @@ -1093,13 +1057,12 @@ nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks) } NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_reclaim: count packets")); - /* * count a chained packet only once. */ if (tx_msg_p->tx_message != NULL) { - nxge_txdma_freemsg_list_add(tx_ring_p, - tx_msg_p); + freemsg(tx_msg_p->tx_message); + tx_msg_p->tx_message = NULL; } tx_msg_p->flags.dma_type = USE_NONE; @@ -1223,13 +1186,7 @@ nxge_tx_intr(void *arg1, void *arg2) "status 0x%08x (mk bit set, calling reclaim)", channel, vindex, rs)); - MUTEX_ENTER(&tx_ring_p->lock); - (void) nxge_txdma_reclaim(nxgep, tx_rings[vindex], 0); - MUTEX_EXIT(&tx_ring_p->lock); - - nxge_txdma_freemsg_task(tx_ring_p); - - mac_tx_update(nxgep->mach); + nxge_tx_ring_task((void *)tx_ring_p); } /* @@ -1596,7 +1553,6 @@ nxge_txdma_fixup_channel(p_nxge_t nxgep, p_tx_ring_t ring_p, uint16_t channel) ring_p->ring_kick_tail.value = 0; ring_p->descs_pending = 0; MUTEX_EXIT(&ring_p->lock); - nxge_txdma_freemsg_task(ring_p); NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_txdma_fixup_channel")); } @@ -1831,7 +1787,6 @@ nxge_txdma_channel_hung(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, uint16_t channel) tail_wrap = tx_ring_p->wr_index_wrap; tx_rd_index = tx_ring_p->rd_index; MUTEX_EXIT(&tx_ring_p->lock); - nxge_txdma_freemsg_task(tx_ring_p); NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_channel_hung: tdc %d tx_rd_index %d " @@ -2010,8 +1965,6 @@ nxge_txdma_fixup_hung_channel(p_nxge_t nxgep, p_tx_ring_t ring_p, (void) nxge_txdma_reclaim(nxgep, ring_p, 0); MUTEX_EXIT(&ring_p->lock); - nxge_txdma_freemsg_task(ring_p); - handle = NXGE_DEV_NPI_HANDLE(nxgep); /* * Stop the dma channel waits for the stop done. 
@@ -2072,10 +2025,8 @@ nxge_reclaim_rings(p_nxge_t nxgep) NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_reclaim_rings: TDC %d", tdc)); MUTEX_ENTER(&ring->lock); - (void) nxge_txdma_reclaim(nxgep, ring, tdc); + (void) nxge_txdma_reclaim(nxgep, ring, 0); MUTEX_EXIT(&ring->lock); - - nxge_txdma_freemsg_task(ring); } } } @@ -2580,6 +2531,7 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, int i, j, index; uint32_t size, bsize; uint32_t nblocks, nmsgs; + char qname[TASKQ_NAMELEN]; NXGE_DEBUG_MSG((nxgep, MEM3_CTL, "==> nxge_map_txdma_channel_buf_ring")); @@ -2611,14 +2563,19 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, KMEM_ZALLOC(sizeof (tx_ring_t), KM_SLEEP); MUTEX_INIT(&tx_ring_p->lock, NULL, MUTEX_DRIVER, (void *)nxgep->interrupt_cookie); - MUTEX_INIT(&tx_ring_p->freelock, NULL, MUTEX_DRIVER, - (void *)nxgep->interrupt_cookie); (void) atomic_swap_32(&tx_ring_p->tx_ring_offline, NXGE_TX_RING_ONLINE); tx_ring_p->tx_ring_busy = B_FALSE; tx_ring_p->nxgep = nxgep; - tx_ring_p->serial = nxge_serialize_create(nmsgs, - nxge_serial_tx, tx_ring_p); + tx_ring_p->tx_ring_handle = (mac_ring_handle_t)NULL; + (void) snprintf(qname, TASKQ_NAMELEN, "tx_%d_%d", + nxgep->instance, channel); + tx_ring_p->taskq = ddi_taskq_create(nxgep->dip, qname, 1, + TASKQ_DEFAULTPRI, 0); + if (tx_ring_p->taskq == NULL) { + goto nxge_map_txdma_channel_buf_ring_fail1; + } + /* * Allocate transmit message rings and handles for packets * not to be copied to premapped buffers. @@ -2683,7 +2640,6 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, for (j = 0; j < nblocks; j++) { tx_msg_ring[index].buf_dma_handle = tx_buf_dma_handle; - tx_msg_ring[index].nextp = NULL; dmap = &tx_msg_ring[index++].buf_dma; #ifdef TX_MEM_DEBUG NXGE_DEBUG_MSG((nxgep, MEM3_CTL, @@ -2705,9 +2661,9 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, goto nxge_map_txdma_channel_buf_ring_exit; nxge_map_txdma_channel_buf_ring_fail1: - if (tx_ring_p->serial) { - nxge_serialize_destroy(tx_ring_p->serial); - tx_ring_p->serial = NULL; + if (tx_ring_p->taskq) { + ddi_taskq_destroy(tx_ring_p->taskq); + tx_ring_p->taskq = NULL; } index--; @@ -2716,8 +2672,6 @@ nxge_map_txdma_channel_buf_ring_fail1: ddi_dma_free_handle(&tx_msg_ring[index].dma_handle); } } - - MUTEX_DESTROY(&tx_ring_p->freelock); MUTEX_DESTROY(&tx_ring_p->lock); KMEM_FREE(tx_msg_ring, size); KMEM_FREE(tx_ring_p, sizeof (tx_ring_t)); @@ -2783,12 +2737,11 @@ nxge_unmap_txdma_channel_buf_ring(p_nxge_t nxgep, p_tx_ring_t tx_ring_p) MUTEX_EXIT(&tx_ring_p->lock); - if (tx_ring_p->serial) { - nxge_serialize_destroy(tx_ring_p->serial); - tx_ring_p->serial = NULL; + if (tx_ring_p->taskq) { + ddi_taskq_destroy(tx_ring_p->taskq); + tx_ring_p->taskq = NULL; } - MUTEX_DESTROY(&tx_ring_p->freelock); MUTEX_DESTROY(&tx_ring_p->lock); KMEM_FREE(tx_msg_ring, sizeof (tx_msg_t) * tx_ring_p->tx_ring_size); KMEM_FREE(tx_ring_p, sizeof (tx_ring_t)); @@ -3408,8 +3361,6 @@ nxge_txdma_fatal_err_recover( if (status != NXGE_OK) goto fail; - nxge_txdma_freemsg_task(tx_ring_p); - NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, "Recovery Successful, TxDMAChannel#%d Restored", channel)); @@ -3420,8 +3371,6 @@ nxge_txdma_fatal_err_recover( fail: MUTEX_EXIT(&tx_ring_p->lock); - nxge_txdma_freemsg_task(tx_ring_p); - NXGE_DEBUG_MSG((nxgep, TX_CTL, "nxge_txdma_fatal_err_recover (channel %d): " "failed to recover this txdma channel", channel)); @@ -3519,7 +3468,6 @@ nxge_tx_port_fatal_err_recover(p_nxge_t nxgep) tx_ring_t *ring = nxgep->tx_rings->rings[tdc]; if (ring) { (void) 
nxge_txdma_reclaim(nxgep, ring, 0); - nxge_txdma_freemsg_task(ring); } } } diff --git a/usr/src/uts/common/io/nxge/nxge_virtual.c b/usr/src/uts/common/io/nxge/nxge_virtual.c index 818f8451c2..2498f77e90 100644 --- a/usr/src/uts/common/io/nxge/nxge_virtual.c +++ b/usr/src/uts/common/io/nxge/nxge_virtual.c @@ -77,6 +77,12 @@ extern uint32_t nxge_rbr_spare_size; extern npi_status_t npi_mac_altaddr_disable(npi_handle_t, uint8_t, uint8_t); +/* + * XXX: Used temporarily to specify the number of packets processed by each + * interrupt. By default, one packet is processed per interrupt. + */ +int nxge_max_intr_pkts; + static uint8_t p2_tx_fair[2] = {12, 12}; static uint8_t p2_tx_equal[2] = {12, 12}; static uint8_t p4_tx_fair[4] = {6, 6, 6, 6}; @@ -783,7 +789,7 @@ nxge_update_txdma_properties(p_nxge_t nxgep, config_token_t token, int ddi_status = DDI_SUCCESS; int num_ports = nxgep->nports; int port, bits, j; - uint8_t start_tdc = 0, num_tdc = 0; + uint8_t start_tdc, num_tdc = 0; p_nxge_param_t param_arr; uint32_t tdc_bitmap[MAX_SIBLINGS]; int custom_start_tdc[MAX_SIBLINGS]; @@ -1616,6 +1622,14 @@ nxge_get_config_properties(p_nxge_t nxgep) } /* + * XXX: read the config file to determine the number of packets + * to process per interrupt. + */ + nxge_max_intr_pkts = ddi_getprop(DDI_DEV_T_ANY, nxgep->dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "max_intr_pkts", 1); + + + /* * Get info on how many ports Neptune card has. */ nxgep->nports = nxge_get_nports(nxgep); @@ -1806,12 +1820,12 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) return (NXGE_DDI_FAILED); } - p_cfgp->tdc.count = nxgep->max_tdcs = ndmas; + p_cfgp->tdc.count = ndmas; p_cfgp->tdc.owned = p_cfgp->tdc.count; NXGE_DEBUG_MSG((nxgep, OBP_CTL, "==> nxge_use_default_dma_config_n2: " - "p_cfgp 0x%llx max_tdcs %d nxgep->max_tdcs %d start %d", - p_cfgp, p_cfgp->tdc.count, nxgep->max_tdcs, p_cfgp->tdc.start)); + "p_cfgp 0x%llx max_tdcs %d start %d", + p_cfgp, p_cfgp->tdc.count, p_cfgp->tdc.start)); /* Receive DMA */ ndmas = NXGE_RDMA_PER_NIU_PORT; @@ -1834,12 +1848,11 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) return (NXGE_DDI_FAILED); } - p_cfgp->max_rdcs = nxgep->max_rdcs = ndmas; + p_cfgp->max_rdcs = ndmas; nxgep->rdc_mask = (ndmas - 1); /* Hypervisor: rdc # and group # use the same # !! */ p_cfgp->max_grpids = p_cfgp->max_rdcs + p_cfgp->tdc.owned; - p_cfgp->start_grpid = 0; p_cfgp->mif_ldvid = p_cfgp->mac_ldvid = p_cfgp->ser_ldvid = 0; if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, nxgep->dip, 0, @@ -1909,13 +1922,12 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) p_cfgp->max_ldgs = p_cfgp->max_grpids; NXGE_DEBUG_MSG((nxgep, OBP_CTL, - "==> nxge_use_default_dma_config_n2: " - "p_cfgp 0x%llx max_rdcs %d nxgep->max_rdcs %d max_grpids %d" - "start_grpid %d macid %d mifid %d serrid %d", - p_cfgp, p_cfgp->max_rdcs, nxgep->max_rdcs, p_cfgp->max_grpids, - p_cfgp->start_grpid, + "==> nxge_use_default_dma_config_n2: p_cfgp 0x%llx max_rdcs %d " + "max_grpids %d macid %d mifid %d serrid %d", + p_cfgp, p_cfgp->max_rdcs, p_cfgp->max_grpids, p_cfgp->mac_ldvid, p_cfgp->mif_ldvid, p_cfgp->ser_ldvid)); + NXGE_DEBUG_MSG((nxgep, OBP_CTL, "==> nxge_use_default_dma_config_n2: " "p_cfgp p%p start_ldg %d nxgep->max_ldgs %d", p_cfgp, p_cfgp->start_ldg, p_cfgp->max_ldgs)); @@ -1923,12 +1935,14 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) /* * RDC groups and the beginning RDC group assigned to this function.
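 * As a worked example (illustrative numbers only): if NXGE_MAX_RDC_GROUPS
 * were 8 on a 4-port part, each function would own 8 / 4 = 2 RDC groups,
 * and function 2's first group id would be 2 * 8 / 4 = 4. The TDC group
 * ids are derived the same way from NXGE_MAX_TDC_GROUPS.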
*/ - p_cfgp->max_rdc_grpids = 1; - p_cfgp->def_mac_rxdma_grpid = (nxgep->function_num * 1); - - if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind - (nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) - >= NXGE_MAX_RDC_GRPS) { + p_cfgp->max_rdc_grpids = NXGE_MAX_RDC_GROUPS / nxgep->nports; + p_cfgp->def_mac_rxdma_grpid = + nxgep->function_num * NXGE_MAX_RDC_GROUPS / nxgep->nports; + p_cfgp->def_mac_txdma_grpid = + nxgep->function_num * NXGE_MAX_TDC_GROUPS / nxgep->nports; + + if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(nxgep, + p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= NXGE_MAX_RDC_GRPS) { NXGE_ERROR_MSG((nxgep, CFG_CTL, "nxge_use_default_dma_config_n2(): " "nxge_fzc_rdc_tbl_bind failed")); @@ -2060,11 +2074,10 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) prop, tx_ndmas); } - p_cfgp->tdc.count = nxgep->max_tdcs = tx_ndmas; + p_cfgp->tdc.count = tx_ndmas; p_cfgp->tdc.owned = p_cfgp->tdc.count; NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_cfg_dma_config: " - "p_cfgp 0x%llx max_tdcs %d nxgep->max_tdcs %d", - p_cfgp, p_cfgp->tdc.count, nxgep->max_tdcs)); + "p_cfgp 0x%llx max_tdcs %d", p_cfgp, p_cfgp->tdc.count)); prop = param_arr[param_rxdma_channels_begin].fcode_name; @@ -2149,44 +2162,23 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) prop, rx_ndmas); } - p_cfgp->max_rdcs = nxgep->max_rdcs = rx_ndmas; + p_cfgp->max_rdcs = rx_ndmas; - prop = param_arr[param_rdc_grps_start].fcode_name; - if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, prop, - &prop_val, &prop_len) == DDI_PROP_SUCCESS) { - p_cfgp->def_mac_rxdma_grpid = *prop_val; - ddi_prop_free(prop_val); - if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind - (nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) - >= NXGE_MAX_RDC_GRPS) { - NXGE_ERROR_MSG((nxgep, CFG_CTL, - "nxge_use_cfg_dma_config(): " - "nxge_fzc_rdc_tbl_bind failed")); - cmn_err(CE_CONT, "nxge%d: group not available!\n", - nxgep->instance); - goto nxge_use_cfg_dma_config_exit; - } + /* + * RDC groups and the beginning RDC group assigned to this function. + * XXX: this may be wrong if prop value is used. 
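+ * (I.e., if the group start were still taken from the
+ * param_rdc_grps_start driver property, as the code removed just above
+ * used to do, the default computed from function_num here could
+ * disagree with the property-derived value.)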
+ */ + p_cfgp->def_mac_rxdma_grpid = + nxgep->function_num * NXGE_MAX_RDC_GROUPS / nxgep->nports; + p_cfgp->def_mac_txdma_grpid = + nxgep->function_num * NXGE_MAX_TDC_GROUPS / nxgep->nports; - NXGE_DEBUG_MSG((nxgep, CFG_CTL, - "==> nxge_use_default_dma_config: " - "use property " "start_grpid %d ", - p_cfgp->start_grpid)); - } else { - p_cfgp->def_mac_rxdma_grpid = nxgep->function_num; - if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind( - nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= - NXGE_MAX_RDC_GRPS) { - cmn_err(CE_CONT, "nxge%d: group not available!\n", - nxgep->instance); - goto nxge_use_cfg_dma_config_exit; - } - (void) ddi_prop_update_int(DDI_DEV_T_NONE, nxgep->dip, - prop, p_cfgp->def_mac_rxdma_grpid); - NXGE_DEBUG_MSG((nxgep, CFG_CTL, - "==> nxge_use_default_dma_config: " - "use default " - "start_grpid %d (same as function #)", - p_cfgp->start_grpid)); + if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(nxgep, + p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= NXGE_MAX_RDC_GRPS) { + NXGE_ERROR_MSG((nxgep, CFG_CTL, + "nxge_use_cfg_dma_config(): " + "nxge_fzc_rdc_tbl_bind failed")); + goto nxge_use_cfg_dma_config_exit; } prop = param_arr[param_rx_rdc_grps].fcode_name; @@ -2195,7 +2187,7 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) nrxgp = *prop_val; ddi_prop_free(prop_val); } else { - nrxgp = 1; + nrxgp = NXGE_MAX_RDC_GRPS / nxgep->nports; (void) ddi_prop_update_int(DDI_DEV_T_NONE, nxgep->dip, prop, nrxgp); NXGE_DEBUG_MSG((nxgep, CFG_CTL, @@ -2203,7 +2195,6 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) "num_rdc_grpid not found: use def:# of " "rdc groups %d\n", nrxgp)); } - p_cfgp->max_rdc_grpids = nrxgp; /* @@ -2213,10 +2204,9 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) p_cfgp->max_ldgs = NXGE_LDGRP_PER_4PORTS; NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_default_dma_config: " - "p_cfgp 0x%llx max_rdcs %d nxgep->max_rdcs %d max_grpids %d" - "start_grpid %d", - p_cfgp, p_cfgp->max_rdcs, nxgep->max_rdcs, p_cfgp->max_grpids, - p_cfgp->start_grpid)); + "p_cfgp 0x%llx max_rdcs %d max_grpids %d default_grpid %d", + p_cfgp, p_cfgp->max_rdcs, p_cfgp->max_grpids, + p_cfgp->def_mac_rxdma_grpid)); NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_cfg_dma_config: " "p_cfgp 0x%016llx start_ldg %d nxgep->max_ldgs %d " @@ -2264,7 +2254,7 @@ nxge_get_logical_props(p_nxge_t nxgep) (void) memset(port, 0, sizeof (*port)); - port->mac_port = 0; /* := function number */ + port->mac_port = nxgep->function_num; /* := function number */ /* * alloc_buf_size: @@ -2300,8 +2290,9 @@ nxge_get_logical_props(p_nxge_t nxgep) group = &port->rdc_grps[0]; - group->flag = 1; /* configured */ + group->flag = B_TRUE; /* configured */ group->config_method = RDC_TABLE_ENTRY_METHOD_REP; + group->port = NXGE_GET_PORT_NUM(nxgep->function_num); /* HIO futures: this is still an open question.
*/ hardware->max_macs = 1; @@ -2407,129 +2398,138 @@ nxge_set_rdc_intr_property(p_nxge_t nxgep) static void nxge_set_hw_dma_config(p_nxge_t nxgep) { - int i, ndmas, ngrps, bitmap, end, st_rdc; - int32_t status; - uint8_t rdcs_per_grp; - p_nxge_dma_pt_cfg_t p_dma_cfgp; - p_nxge_hw_pt_cfg_t p_cfgp; - p_nxge_rdc_grp_t rdc_grp_p; - int rdcgrp_cfg = CFG_NOT_SPECIFIED, rx_quick_cfg; - char *prop, *prop_val; - p_nxge_param_t param_arr; - config_token_t token; - nxge_grp_t *group; + int i, j, ngrps, bitmap, end, st_rdc; + p_nxge_dma_pt_cfg_t p_dma_cfgp; + p_nxge_hw_pt_cfg_t p_cfgp; + p_nxge_rdc_grp_t rdc_grp_p; + p_nxge_tdc_grp_t tdc_grp_p; + nxge_grp_t *group; + uint8_t nrdcs; + dc_map_t map = 0; NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_set_hw_dma_config")); p_dma_cfgp = (p_nxge_dma_pt_cfg_t)&nxgep->pt_config; p_cfgp = (p_nxge_hw_pt_cfg_t)&p_dma_cfgp->hw_config; - rdc_grp_p = p_dma_cfgp->rdc_grps; + switch (nxgep->niu_type) { + case NEPTUNE_4_1GC: + case NEPTUNE_2_10GF_2_1GC: + case NEPTUNE_1_10GF_3_1GC: + case NEPTUNE_1_1GC_1_10GF_2_1GC: + case NEPTUNE_2_10GF_2_1GRF: + default: + ngrps = 2; + break; + case NEPTUNE_2_10GF: + case NEPTUNE_2_1GRF: + case N2_NIU: + ngrps = 4; + break; + } + + /* + * Setup TDC groups + */ bitmap = 0; end = p_cfgp->tdc.start + p_cfgp->tdc.owned; - p_dma_cfgp->tx_dma_map = 0; for (i = p_cfgp->tdc.start; i < end; i++) { bitmap |= (1 << i); } nxgep->tx_set.owned.map |= bitmap; /* Owned, & not shared. */ + nxgep->tx_set.owned.count = p_cfgp->tdc.owned; + p_dma_cfgp->tx_dma_map = bitmap; - group = (nxge_grp_t *)nxge_grp_add(nxgep, NXGE_TRANSMIT_GROUP); - group->map = bitmap; + for (i = 0; i < ngrps; i++) { + group = (nxge_grp_t *)nxge_grp_add(nxgep, + NXGE_TRANSMIT_GROUP); + tdc_grp_p = &p_dma_cfgp->tdc_grps[ + p_cfgp->def_mac_txdma_grpid + i]; + if (i == 0) + tdc_grp_p->map = bitmap; + else + tdc_grp_p->map = 0; + /* no ring is associated with a group initially */ + tdc_grp_p->start_tdc = 0; + tdc_grp_p->max_tdcs = 0; + tdc_grp_p->grp_index = group->index; + } - p_dma_cfgp->tx_dma_map = bitmap; - param_arr = nxgep->param_arr; + for (i = 0; i < NXGE_MAX_RDCS; i++) { + nxgep->rx_channel_started[i] = B_FALSE; + } - /* Assume RDCs are evenly distributed */ - rx_quick_cfg = param_arr[param_rx_quick_cfg].value; - switch (rx_quick_cfg) { - case CFG_NOT_SPECIFIED: - prop = "rxdma-grp-cfg"; - status = ddi_prop_lookup_string(DDI_DEV_T_NONE, - nxgep->dip, 0, prop, (char **)&prop_val); - if (status != DDI_PROP_SUCCESS) { - NXGE_DEBUG_MSG((nxgep, CFG_CTL, - " property %s not found", prop)); - rdcgrp_cfg = CFG_L3_DISTRIBUTE; - } else { - token = nxge_get_config_token(prop_val); - switch (token) { - case L2_CLASSIFY: + /* + * Setup RDC groups + */ + st_rdc = p_cfgp->start_rdc; + for (i = 0; i < ngrps; i++) { + /* + * All rings are associated with the default group initially + */ + if (i == 0) { + /* default group */ + switch (nxgep->niu_type) { + case NEPTUNE_4_1GC: + nrdcs = rx_4_1G[nxgep->function_num]; + break; + case N2_NIU: + case NEPTUNE_2_10GF: + nrdcs = rx_2_10G[nxgep->function_num]; + break; + case NEPTUNE_2_10GF_2_1GC: + nrdcs = rx_2_10G_2_1G[nxgep->function_num]; break; - case CLASSIFY: - case L3_CLASSIFY: - case L3_DISTRIBUTE: - case L3_TCAM: - rdcgrp_cfg = CFG_L3_DISTRIBUTE; + case NEPTUNE_1_10GF_3_1GC: + nrdcs = rx_1_10G_3_1G[nxgep->function_num]; + break; + case NEPTUNE_1_1GC_1_10GF_2_1GC: + nrdcs = rx_1_1G_1_10G_2_1G[nxgep->function_num]; break; default: - rdcgrp_cfg = CFG_L3_DISTRIBUTE; + switch (nxgep->platform_type) { + case P_NEPTUNE_ALONSO: + nrdcs = + 
rx_2_10G_2_1G[nxgep->function_num]; + break; + default: + nrdcs = rx_4_1G[nxgep->function_num]; + break; + } break; } - ddi_prop_free(prop_val); + } else { + nrdcs = 0; } - break; - case CFG_L3_WEB: - case CFG_L3_DISTRIBUTE: - case CFG_L2_CLASSIFY: - case CFG_L3_TCAM: - rdcgrp_cfg = rx_quick_cfg; - break; - default: - rdcgrp_cfg = CFG_L3_DISTRIBUTE; - break; - } - - st_rdc = p_cfgp->start_rdc; - - switch (rdcgrp_cfg) { - case CFG_L3_DISTRIBUTE: - case CFG_L3_WEB: - case CFG_L3_TCAM: - ndmas = p_cfgp->max_rdcs; - ngrps = 1; - rdcs_per_grp = ndmas / ngrps; - break; - case CFG_L2_CLASSIFY: - ndmas = p_cfgp->max_rdcs / 2; - if (p_cfgp->max_rdcs < 2) - ndmas = 1; - ngrps = 1; - rdcs_per_grp = ndmas / ngrps; - break; - default: - ngrps = p_cfgp->max_rdc_grpids; - ndmas = p_cfgp->max_rdcs; - rdcs_per_grp = ndmas / ngrps; - break; - } - - for (i = 0; i < ngrps; i++) { - uint8_t count = rdcs_per_grp; - dc_map_t map = 0; rdc_grp_p = &p_dma_cfgp->rdc_grps[ p_cfgp->def_mac_rxdma_grpid + i]; - rdc_grp_p->start_rdc = st_rdc + i * rdcs_per_grp; - rdc_grp_p->max_rdcs = rdcs_per_grp; + rdc_grp_p->start_rdc = st_rdc; + rdc_grp_p->max_rdcs = nrdcs; rdc_grp_p->def_rdc = rdc_grp_p->start_rdc; /* default to: 0, 1, 2, 3, ...., 0, 1, 2, 3.... */ - while (count) { - map |= (1 << count); - count--; - } - map >>= 1; /* In case <start_rdc> is zero (0) */ - map <<= rdc_grp_p->start_rdc; + if (nrdcs != 0) { + for (j = 0; j < nrdcs; j++) { + map |= (1 << j); + } + map <<= rdc_grp_p->start_rdc; + } else + map = 0; rdc_grp_p->map = map; nxgep->rx_set.owned.map |= map; /* Owned, & not shared. */ + nxgep->rx_set.owned.count = nrdcs; group = (nxge_grp_t *)nxge_grp_add(nxgep, NXGE_RECEIVE_GROUP); - group->map = rdc_grp_p->map; rdc_grp_p->config_method = RDC_TABLE_ENTRY_METHOD_SEQ; - rdc_grp_p->flag = 1; /* This group has been configured. */ + rdc_grp_p->flag = B_TRUE; /* This group has been configured. */ + rdc_grp_p->grp_index = group->index; + rdc_grp_p->port = NXGE_GET_PORT_NUM(nxgep->function_num); + + map = 0; } @@ -2742,7 +2742,7 @@ nxge_set_hw_mac_class_config(p_nxge_t nxgep) " id %d grp %d", mac_map->param_id, mac_map->map_to)); mac_host_info[mac_map->param_id].mpr_npr = - mac_map->pref; + p_cfgp->mac_pref; mac_host_info[mac_map->param_id].rdctbl = mac_map->map_to + p_cfgp->def_mac_rxdma_grpid; @@ -2967,16 +2967,12 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p) } /* - * Port0 uses the HW based syserr interrupt, and port1 uses the - * SW based syserr interrupt. There is only one syserr and the - * function zero device gets it. + * HW based syserr interrupt for port0, and SW based syserr interrupt + * for port1 */ if (own_sys_err && p_cfgp->ser_ldvid) { ldv = p_cfgp->ser_ldvid; /* - * Port0 - HW based: use an intr vector - */ - /* * Unmask the system interrupt states. 
*/ (void) nxge_fzc_sys_err_mask_set(nxgep, SYS_ERR_SMX_MASK | @@ -2999,8 +2995,8 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p) nldvs++; } else { /* - * Port1 - SW based: allocate the ldv for the syserr since - * the vector should not be consumed for port1 + * SW based: allocate the ldv for the syserr since the vector + * should not be consumed for port1 */ sysldvp = KMEM_ZALLOC(sizeof (nxge_ldv_t), KM_SLEEP); sysldvp->use_timer = B_TRUE; @@ -3010,9 +3006,10 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p) sysldvp->ldv_ldf_masks = 0; sysldvp->nxgep = nxgep; ldgvp->ldvp_syserr = sysldvp; - ldgvp->ldvp_syserr_allocated = B_TRUE; + ldgvp->ldvp_syserr_alloced = B_TRUE; } + NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_ldgv_init_n2: " "(before rx) func %d nldvs %d navail %d nrequired %d", func, nldvs, *navail_p, *nrequired_p)); @@ -3326,7 +3323,7 @@ nxge_ldgv_uninit(p_nxge_t nxgep) "no logical group configured.")); return (NXGE_OK); } - if (ldgvp->ldvp_syserr_allocated == B_TRUE) { + if (ldgvp->ldvp_syserr_alloced == B_TRUE) { KMEM_FREE(ldgvp->ldvp_syserr, sizeof (nxge_ldv_t)); } if (ldgvp->ldgp) { @@ -3925,3 +3922,29 @@ nxge_init_mmac(p_nxge_t nxgep, boolean_t compute_addrs) nxgep->statsp->mmac_stats.mmac_max_cnt = mmac_info->num_mmac; nxgep->statsp->mmac_stats.mmac_avail_cnt = mmac_info->num_mmac; } + +/* + * Convert an RDC group index into a port ring index. That is, map + * <groupid> to an index into nxgep->rx_ring_handles. + * (group ring index -> port ring index) + */ +int +nxge_get_rxring_index(p_nxge_t nxgep, int groupid, int ringidx) +{ + int i; + int index = 0; + p_nxge_rdc_grp_t rdc_grp_p; + p_nxge_dma_pt_cfg_t p_dma_cfgp; + p_nxge_hw_pt_cfg_t p_cfgp; + + p_dma_cfgp = &nxgep->pt_config; + p_cfgp = &p_dma_cfgp->hw_config; + + for (i = 0; i < groupid; i++) { + rdc_grp_p = + &p_dma_cfgp->rdc_grps[p_cfgp->def_mac_rxdma_grpid + i]; + index += rdc_grp_p->max_rdcs; + } + + return (index + ringidx); +} diff --git a/usr/src/uts/common/io/pcan/pcan.c b/usr/src/uts/common/io/pcan/pcan.c index b5b0604831..498a9eea60 100644 --- a/usr/src/uts/common/io/pcan/pcan.c +++ b/usr/src/uts/common/io/pcan/pcan.c @@ -46,7 +46,7 @@ #include <sys/pccard.h> #include <sys/pci.h> #include <sys/policy.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/stream.h> #include <inet/common.h> #include <inet/nd.h> @@ -104,7 +104,6 @@ mac_callbacks_t pcan_m_callbacks = { pcan_sdmulti, pcan_saddr, pcan_tx, - NULL, pcan_ioctl }; diff --git a/usr/src/uts/common/io/pcwl/pcwl.c b/usr/src/uts/common/io/pcwl/pcwl.c index f8d0cd2c4b..a2bad90c68 100644 --- a/usr/src/uts/common/io/pcwl/pcwl.c +++ b/usr/src/uts/common/io/pcwl/pcwl.c @@ -46,7 +46,7 @@ #include <sys/pccard.h> #include <sys/pci.h> #include <sys/policy.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/stream.h> #include <inet/common.h> #include <inet/nd.h> @@ -89,7 +89,6 @@ mac_callbacks_t pcwl_m_callbacks = { pcwl_sdmulti, pcwl_saddr, pcwl_tx, - NULL, pcwl_ioctl }; diff --git a/usr/src/uts/common/io/ral/rt2560.c b/usr/src/uts/common/io/ral/rt2560.c index d1473e1972..e6feee3ff4 100644 --- a/usr/src/uts/common/io/ral/rt2560.c +++ b/usr/src/uts/common/io/ral/rt2560.c @@ -43,7 +43,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -196,7 +196,6 @@ static mac_callbacks_t rt2560_m_callbacks = { rt2560_m_multicst, rt2560_m_unicst, 
rt2560_m_tx, - NULL, /* mc_resources; */ rt2560_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/rge/rge.h b/usr/src/uts/common/io/rge/rge.h index 4cab63b289..4a58da1c92 100755..100644 --- a/usr/src/uts/common/io/rge/rge.h +++ b/usr/src/uts/common/io/rge/rge.h @@ -26,8 +26,6 @@ #ifndef _RGE_H #define _RGE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -59,7 +57,7 @@ extern "C" { #include <sys/ddi.h> #include <sys/sunddi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* @@ -430,7 +428,6 @@ typedef struct rge { uint32_t rf_next; /* current free buf index */ uint32_t rc_next; /* current recycle buf index */ uint32_t rx_free; /* number of rx free buf */ - mac_resource_handle_t handle; /* used for send */ rge_bd_t *tx_ring; @@ -705,7 +702,7 @@ void rge_chip_init(rge_t *rgep); void rge_chip_start(rge_t *rgep); void rge_chip_stop(rge_t *rgep, boolean_t fault); void rge_chip_sync(rge_t *rgep, enum rge_sync_op todo); -void rge_chip_blank(void *arg, time_t ticks, uint_t count); +void rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag); void rge_tx_trigger(rge_t *rgep); void rge_hw_stats_dump(rge_t *rgep); uint_t rge_intr(caddr_t arg1, caddr_t arg2); diff --git a/usr/src/uts/common/io/rge/rge_chip.c b/usr/src/uts/common/io/rge/rge_chip.c index 6210fc25fc..c509e01ebb 100644 --- a/usr/src/uts/common/io/rge/rge_chip.c +++ b/usr/src/uts/common/io/rge/rge_chip.c @@ -1258,11 +1258,12 @@ rge_chip_sync(rge_t *rgep, enum rge_sync_op todo) } } -void rge_chip_blank(void *arg, time_t ticks, uint_t count); +void rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag); #pragma no_inline(rge_chip_blank) +/* ARGSUSED */ void -rge_chip_blank(void *arg, time_t ticks, uint_t count) +rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag) { _NOTE(ARGUNUSED(arg, ticks, count)); } diff --git a/usr/src/uts/common/io/rge/rge_main.c b/usr/src/uts/common/io/rge/rge_main.c index c473a86b7f..ab9ed63203 100644 --- a/usr/src/uts/common/io/rge/rge_main.c +++ b/usr/src/uts/common/io/rge/rge_main.c @@ -109,11 +109,10 @@ static void rge_m_stop(void *); static int rge_m_promisc(void *, boolean_t); static int rge_m_multicst(void *, boolean_t, const uint8_t *); static int rge_m_unicst(void *, const uint8_t *); -static void rge_m_resources(void *); static void rge_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t rge_m_getcapab(void *, mac_capab_t, void *); -#define RGE_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB) +#define RGE_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) static mac_callbacks_t rge_m_callbacks = { RGE_M_CALLBACK_FLAGS, @@ -124,7 +123,6 @@ static mac_callbacks_t rge_m_callbacks = { rge_m_multicst, rge_m_unicst, rge_m_tx, - rge_m_resources, rge_m_ioctl, rge_m_getcapab }; @@ -1249,28 +1247,6 @@ rge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } } -static void -rge_m_resources(void *arg) -{ - rge_t *rgep = arg; - mac_rx_fifo_t mrf; - - mutex_enter(rgep->genlock); - - /* - * Register Rx rings as resources and save mac - * resource id for future reference - */ - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = rge_chip_blank; - mrf.mrf_arg = (void *)rgep; - mrf.mrf_normal_blank_time = RGE_RX_INT_TIME; - mrf.mrf_normal_pkt_count = RGE_RX_INT_PKTS; - rgep->handle = mac_resource_add(rgep->mh, (mac_resource_t *)&mrf); - - mutex_exit(rgep->genlock); -} - /* ARGSUSED */ static boolean_t rge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) @@ -1302,12 +1278,6 @@ rge_m_getcapab(void *arg, 
mac_capab_t cap, void *cap_data) } break; } - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. - */ - break; default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/rge/rge_rxtx.c b/usr/src/uts/common/io/rge/rge_rxtx.c index 301b023e5a..09d23825d3 100755..100644 --- a/usr/src/uts/common/io/rge/rge_rxtx.c +++ b/usr/src/uts/common/io/rge/rge_rxtx.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "rge.h" #define U32TOPTR(x) ((void *)(uintptr_t)(uint32_t)(x)) @@ -369,7 +367,7 @@ rge_receive(rge_t *rgep) mutex_exit(rgep->rx_lock); if (mp != NULL) - mac_rx(rgep->mh, rgep->handle, mp); + mac_rx(rgep->mh, NULL, mp); } diff --git a/usr/src/uts/common/io/rtw/rtw.c b/usr/src/uts/common/io/rtw/rtw.c index 1b99f01099..fa471c83a8 100644 --- a/usr/src/uts/common/io/rtw/rtw.c +++ b/usr/src/uts/common/io/rtw/rtw.c @@ -54,7 +54,7 @@ #include <sys/sunddi.h> #include <sys/pci.h> #include <sys/errno.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/dlpi.h> #include <sys/ethernet.h> #include <sys/list.h> @@ -178,7 +178,6 @@ static mac_callbacks_t rtw_m_callbacks = { rtw_m_multicst, rtw_m_unicst, rtw_m_tx, - NULL, rtw_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/rum/rum.c b/usr/src/uts/common/io/rum/rum.c index 8b09c53171..6c61cbbebd 100644 --- a/usr/src/uts/common/io/rum/rum.c +++ b/usr/src/uts/common/io/rum/rum.c @@ -43,7 +43,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -291,7 +291,6 @@ static mac_callbacks_t rum_m_callbacks = { rum_m_multicst, rum_m_unicst, rum_m_tx, - NULL, /* mc_resources; */ rum_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/sfe/sfe_util.c b/usr/src/uts/common/io/sfe/sfe_util.c index 0d8f736d15..fdee7b6d2f 100644 --- a/usr/src/uts/common/io/sfe/sfe_util.c +++ b/usr/src/uts/common/io/sfe/sfe_util.c @@ -32,6 +32,11 @@ */ /* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* * System Header files. 
*/ #include <sys/types.h> @@ -1958,7 +1963,7 @@ next: * send up received packets */ mutex_exit(&dp->intrlock); - mac_rx(dp->mh, dp->mac_rx_ring_ha, rx_head); + mac_rx(dp->mh, NULL, rx_head); mutex_enter(&dp->intrlock); } @@ -4050,11 +4055,10 @@ static int gem_m_setpromisc(void *, boolean_t); static int gem_m_multicst(void *, boolean_t, const uint8_t *); static int gem_m_unicst(void *, const uint8_t *); static mblk_t *gem_m_tx(void *, mblk_t *); -static void gem_m_resources(void *); static void gem_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t gem_m_getcapab(void *, mac_capab_t, void *); -#define GEM_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB) +#define GEM_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) static mac_callbacks_t gem_m_callbacks = { GEM_M_CALLBACK_FLAGS, @@ -4065,7 +4069,6 @@ static mac_callbacks_t gem_m_callbacks = { gem_m_multicst, gem_m_unicst, gem_m_tx, - gem_m_resources, gem_m_ioctl, gem_m_getcapab, }; @@ -4590,45 +4593,6 @@ gem_m_tx(void *arg, mblk_t *mp) } static void -gem_set_coalease(void *arg, time_t ticks, uint_t count) -{ - struct gem_dev *dp = arg; - DPRINTF(1, (CE_CONT, "%s: %s: ticks:%d count:%d", - dp->name, __func__, ticks, count)); - - mutex_enter(&dp->intrlock); - dp->poll_pkt_delay = min(count, dp->gc.gc_rx_ring_size/2); - mutex_exit(&dp->intrlock); -} - -static void -gem_m_resources(void *arg) -{ - struct gem_dev *dp = arg; - mac_rx_fifo_t mrf; - - DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); - - mutex_enter(&dp->intrlock); - mutex_enter(&dp->xmitlock); - - /* - * Register Rx rings as resources and save mac - * resource id for future reference - */ - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = gem_set_coalease; - mrf.mrf_arg = (void *)dp; - mrf.mrf_normal_blank_time = 1; /* in uS */ - mrf.mrf_normal_pkt_count = dp->poll_pkt_delay; - - dp->mac_rx_ring_ha = mac_resource_add(dp->mh, (mac_resource_t *)&mrf); - - mutex_exit(&dp->xmitlock); - mutex_exit(&dp->intrlock); -} - -static void gem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) { DPRINTF(0, (CE_CONT, "!%s: %s: called", @@ -4637,18 +4601,11 @@ gem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) gem_mac_ioctl((struct gem_dev *)arg, wq, mp); } +/* ARGSUSED */ static boolean_t gem_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - boolean_t ret; - - ret = B_FALSE; - switch (cap) { - case MAC_CAPAB_POLL: - ret = B_TRUE; - break; - } - return (ret); + return (B_FALSE); } static void diff --git a/usr/src/uts/common/io/sfe/sfe_util.h b/usr/src/uts/common/io/sfe/sfe_util.h index 576a3d5d08..6c8ca8fea4 100644 --- a/usr/src/uts/common/io/sfe/sfe_util.h +++ b/usr/src/uts/common/io/sfe/sfe_util.h @@ -31,9 +31,14 @@ * DAMAGE. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + #ifndef _SFE_UTIL_H_ #define _SFE_UTIL_H_ -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* diff --git a/usr/src/uts/common/io/softmac/softmac_ctl.c b/usr/src/uts/common/io/softmac/softmac_ctl.c index b1b8cd4f42..99c665aae6 100644 --- a/usr/src/uts/common/io/softmac/softmac_ctl.c +++ b/usr/src/uts/common/io/softmac/softmac_ctl.c @@ -23,9 +23,9 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/stropts.h> +#include <sys/strsubr.h> +#include <sys/callb.h> #include <sys/softmac_impl.h> int @@ -192,11 +192,9 @@ softmac_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } static void -softmac_process_notify_ind(queue_t *rq, mblk_t *mp) +softmac_process_notify_ind(softmac_t *softmac, mblk_t *mp) { - softmac_lower_t *slp = rq->q_ptr; dl_notify_ind_t *dlnip = (dl_notify_ind_t *)mp->b_rptr; - softmac_t *softmac = slp->sl_softmac; uint_t addroff, addrlen; ASSERT(dlnip->dl_primitive == DL_NOTIFY_IND); @@ -231,6 +229,73 @@ softmac_process_notify_ind(queue_t *rq, mblk_t *mp) freemsg(mp); } +void +softmac_notify_thread(void *arg) +{ + softmac_t *softmac = arg; + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &softmac->smac_mutex, callb_generic_cpr, + "softmac_notify_thread"); + + mutex_enter(&softmac->smac_mutex); + + /* + * Quit the thread if smac_mh is unregistered. + */ + while (softmac->smac_mh != NULL && + !(softmac->smac_flags & SOFTMAC_NOTIFY_QUIT)) { + mblk_t *mp, *nextmp; + + if ((mp = softmac->smac_notify_head) == NULL) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &softmac->smac_mutex); + continue; + } + + softmac->smac_notify_head = softmac->smac_notify_tail = NULL; + mutex_exit(&softmac->smac_mutex); + + while (mp != NULL) { + nextmp = mp->b_next; + mp->b_next = NULL; + softmac_process_notify_ind(softmac, mp); + mp = nextmp; + } + mutex_enter(&softmac->smac_mutex); + } + + /* + * The softmac is being destroyed; simply free all of the DL_NOTIFY_IND + * messages left in the queue that did not have a chance to be + * processed. + */ + freemsgchain(softmac->smac_notify_head); + softmac->smac_notify_head = softmac->smac_notify_tail = NULL; + softmac->smac_notify_thread = NULL; + cv_broadcast(&softmac->smac_cv); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +static void +softmac_enqueue_notify_ind(queue_t *rq, mblk_t *mp) +{ + softmac_lower_t *slp = rq->q_ptr; + softmac_t *softmac = slp->sl_softmac; + + mutex_enter(&softmac->smac_mutex); + if (softmac->smac_notify_tail == NULL) { + softmac->smac_notify_head = softmac->smac_notify_tail = mp; + } else { + softmac->smac_notify_tail->b_next = mp; + softmac->smac_notify_tail = mp; + } + cv_broadcast(&softmac->smac_cv); + mutex_exit(&softmac->smac_mutex); +} + static void softmac_process_dlpi(softmac_lower_t *slp, mblk_t *mp, uint_t minlen, t_uscalar_t reqprim) @@ -295,7 +360,29 @@ softmac_rput_process_proto(queue_t *rq, mblk_t *mp) if (len < DL_NOTIFY_IND_SIZE) goto runt; - softmac_process_notify_ind(rq, mp); + /* + * Enqueue all the DL_NOTIFY_IND messages and process them + * in a separate thread to avoid deadlock. Here is an + * example of the deadlock scenario: + * + * Thread A: mac_promisc_set()->softmac_m_promisc() + * + * The softmac driver waits for the ACK of the + * DL_PROMISC_PHYS request while holding the MAC perimeter; + * + * Thread B: + * + * The driver handles the DL_PROMISC_PHYS request. Before + * it sends back the ACK, it could first send a + * DL_NOTE_PROMISC_ON_PHYS notification. + * + * Since DL_NOTIFY_IND could eventually cause softmac to call + * mac_xxx_update(), which requires the MAC perimeter, this + * would deadlock the two threads. Enqueuing the + * DL_NOTIFY_IND message and deferring its processing + * avoids the potential deadlock.
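+ * The deferral itself is a plain producer/consumer handoff (see
+ * softmac_enqueue_notify_ind() and softmac_notify_thread() above);
+ * in sketch, the enqueue side is:
+ *
+ *	mutex_enter(&softmac->smac_mutex);
+ *	(append mp to smac_notify_head/smac_notify_tail)
+ *	cv_broadcast(&softmac->smac_cv);
+ *	mutex_exit(&softmac->smac_mutex);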
+ */ + softmac_enqueue_notify_ind(rq, mp); return; case DL_NOTIFY_ACK: diff --git a/usr/src/uts/common/io/softmac/softmac_dev.c b/usr/src/uts/common/io/softmac/softmac_dev.c index 3d2164e782..f548df055d 100644 --- a/usr/src/uts/common/io/softmac/softmac_dev.c +++ b/usr/src/uts/common/io/softmac/softmac_dev.c @@ -222,11 +222,6 @@ softmac_close(queue_t *rq) slp->sl_softmac = NULL; slp->sl_lh = NULL; - /* - * slp->sl_handle could be non-NULL if it is in the aggregation. - */ - slp->sl_handle = (mac_resource_handle_t)NULL; - ASSERT(slp->sl_ack_mp == NULL); ASSERT(slp->sl_ctl_inprogress == B_FALSE); ASSERT(slp->sl_pending_prim == DL_PRIM_INVAL); @@ -266,6 +261,16 @@ softmac_rput(queue_t *rq, mblk_t *mp) } /* + * If this message is looped back from the legacy devices, + * drop it as the Nemo framework will be responsible for + * looping it back by the mac_txloop() function. + */ + if (mp->b_flag & MSGNOLOOP) { + freemsg(mp); + return; + } + + /* * This is the most common case. */ if (DB_REF(mp) == 1) { @@ -276,7 +281,7 @@ softmac_rput(queue_t *rq, mblk_t *mp) * is reset to NULL when DL_CAPAB_POLL is * disabled. */ - mac_rx(slp->sl_softmac->smac_mh, slp->sl_handle, mp); + mac_rx(slp->sl_softmac->smac_mh, NULL, mp); return; } else { softmac_rput_process_data(slp, mp); diff --git a/usr/src/uts/common/io/softmac/softmac_main.c b/usr/src/uts/common/io/softmac/softmac_main.c index d325e3b4c6..0187cf8a28 100644 --- a/usr/src/uts/common/io/softmac/softmac_main.c +++ b/usr/src/uts/common/io/softmac/softmac_main.c @@ -44,6 +44,8 @@ #include <sys/file.h> #include <sys/cred.h> #include <sys/dlpi.h> +#include <sys/mac_provider.h> +#include <sys/disp.h> #include <sys/sunndi.h> #include <sys/modhash.h> #include <sys/stropts.h> @@ -53,11 +55,19 @@ #include <sys/softmac.h> #include <sys/dls.h> +/* Used as a parameter to the mod hash walk of softmac structures */ +typedef struct { + softmac_t *smw_softmac; + boolean_t smw_retry; +} softmac_walk_t; + /* * Softmac hash table including softmacs for both style-2 and style-1 devices. */ static krwlock_t softmac_hash_lock; static mod_hash_t *softmac_hash; +static kmutex_t smac_global_lock; +static kcondvar_t smac_global_cv; #define SOFTMAC_HASHSZ 64 @@ -71,7 +81,7 @@ static void softmac_m_close(void *); static boolean_t softmac_m_getcapab(void *, mac_capab_t, void *); #define SOFTMAC_M_CALLBACK_FLAGS \ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_OPEN | MC_CLOSE) + (MC_IOCTL | MC_GETCAPAB | MC_OPEN | MC_CLOSE) static mac_callbacks_t softmac_m_callbacks = { SOFTMAC_M_CALLBACK_FLAGS, @@ -82,7 +92,6 @@ static mac_callbacks_t softmac_m_callbacks = { softmac_m_multicst, softmac_m_unicst, softmac_m_tx, - softmac_m_resources, softmac_m_ioctl, softmac_m_getcapab, softmac_m_open, @@ -97,6 +106,8 @@ softmac_init() mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); rw_init(&softmac_hash_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&smac_global_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&smac_global_cv, NULL, CV_DRIVER, NULL); } void @@ -104,6 +115,8 @@ softmac_fini() { rw_destroy(&softmac_hash_lock); mod_hash_destroy_hash(softmac_hash); + mutex_destroy(&smac_global_lock); + cv_destroy(&smac_global_cv); } /* ARGSUSED */ @@ -128,7 +141,8 @@ softmac_busy() } /* - * This function is called for each minor node during the post-attach of + * + * softmac_create() is called for each minor node during the post-attach of * each DDI_NT_NET device instance. 
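The enqueue-and-signal handoff above is a standard producer/consumer pattern. As a rough userland model of the same shape, with POSIX threads standing in for the kernel's mutex, condition-variable, and thread primitives (all names here are illustrative, not taken from this commit):

#include <pthread.h>
#include <stdlib.h>

typedef struct msg {
	struct msg	*m_next;
	int		m_note;		/* stands in for a DL_NOTIFY_IND */
} msg_t;

static pthread_mutex_t	q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	q_cv = PTHREAD_COND_INITIALIZER;
static msg_t		*q_head, *q_tail;
static int		q_quit;

/* Receive path: link the message on and signal; never process in place. */
static void
enqueue_notify(msg_t *mp)
{
	(void) pthread_mutex_lock(&q_lock);
	if (q_tail == NULL) {
		q_head = q_tail = mp;
	} else {
		q_tail->m_next = mp;
		q_tail = mp;
	}
	(void) pthread_cond_broadcast(&q_cv);
	(void) pthread_mutex_unlock(&q_lock);
}

/* Worker: grab the whole chain, drop the lock, then process. */
static void *
notify_thread(void *arg)
{
	(void) pthread_mutex_lock(&q_lock);
	while (!q_quit) {
		msg_t	*mp = q_head;

		if (mp == NULL) {
			(void) pthread_cond_wait(&q_cv, &q_lock);
			continue;
		}
		q_head = q_tail = NULL;
		(void) pthread_mutex_unlock(&q_lock);
		while (mp != NULL) {
			msg_t	*next = mp->m_next;
			/* process_notify(mp) would run lock-free here */
			free(mp);
			mp = next;
		}
		(void) pthread_mutex_lock(&q_lock);
	}
	(void) pthread_mutex_unlock(&q_lock);
	return (arg);
}

The property mirrored from the driver is that message processing runs with no queue lock held, so it can safely re-enter paths that take other locks.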
Note that it is possible that a device
 * instance has two minor nodes (DLPI style-1 and style-2), so that for that
 * specific device, softmac_create() could be called twice.
@@ -139,7 +153,99 @@ softmac_busy()
  * For each minor node of a legacy device, a taskq is started to finish
  * softmac_mac_register(), which will finish the rest of work (see comments
  * above softmac_mac_register()).
+ *
+ * softmac state machine
+ * --------------------------------------------------------------------------
+ * OLD STATE		EVENT					NEW STATE
+ * --------------------------------------------------------------------------
+ * UNINIT		attach of 1st minor node		ATTACH_INPROG
+ * okcnt = 0		net_postattach -> softmac_create	okcnt = 1
+ *
+ * ATTACH_INPROG	attach of 2nd minor node (GLDv3)	ATTACH_DONE
+ * okcnt = 1		net_postattach -> softmac_create	okcnt = 2
+ *
+ * ATTACH_INPROG	attach of 2nd minor node (legacy)	ATTACH_INPROG
+ * okcnt = 1		net_postattach -> softmac_create	okcnt = 2
+ *			schedule softmac_mac_register
+ *
+ * ATTACH_INPROG	legacy device node			ATTACH_DONE
+ * okcnt = 2		softmac_mac_register			okcnt = 2
+ *
+ * ATTACH_DONE		detach of 1st minor node		DETACH_INPROG
+ * okcnt = 2		(success)				okcnt = 1
+ *
+ * DETACH_INPROG	detach of 2nd minor node		UNINIT (or free)
+ * okcnt = 1		(success)				okcnt = 0
+ *
+ * ATTACH_DONE		detach failure				state unchanged
+ * DETACH_INPROG						left = okcnt
+ *
+ * DETACH_INPROG	reattach				ATTACH_INPROG
+ * okcnt = 0,1		net_postattach -> softmac_create
+ *
+ * ATTACH_DONE		reattach				ATTACH_DONE
+ * left != 0		net_postattach -> softmac_create	left = 0
+ *
+ * Abbreviation notes:
+ * states have SOFTMAC_ prefix,
+ * okcnt - softmac_attach_okcnt,
+ * left - softmac_attached_left
+ */
+
+#ifdef DEBUG
+void
+softmac_state_verify(softmac_t *softmac)
+{
+	ASSERT(MUTEX_HELD(&softmac->smac_mutex));
+
+	/*
+	 * There are at most 2 minor nodes, one per DLPI style.
+	 */
+	ASSERT(softmac->smac_cnt <= 2 && softmac->smac_attachok_cnt <= 2);
+
+	/*
+	 * The smac_attachok_cnt represents the number of attaches, i.e., the
+	 * number of times net_postattach -> softmac_create() has been called
+	 * for a device instance.
+	 */
+	ASSERT(softmac->smac_attachok_cnt == SMAC_NONZERO_NODECNT(softmac));
+
+	/*
+	 * softmac_create (or softmac_mac_register) -> softmac_create_datalink
+	 * happens only after all minor nodes have been attached.
+	 */
+	ASSERT(softmac->smac_state != SOFTMAC_ATTACH_DONE ||
+	    softmac->smac_attachok_cnt == softmac->smac_cnt);
+
+	if (softmac->smac_attachok_cnt == 0) {
+		ASSERT(softmac->smac_state == SOFTMAC_UNINIT);
+		ASSERT(softmac->smac_mh == NULL);
+	} else if (softmac->smac_attachok_cnt < softmac->smac_cnt) {
+		ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG ||
+		    softmac->smac_state == SOFTMAC_DETACH_INPROG);
+		ASSERT(softmac->smac_mh == NULL);
+	} else {
+		/*
+		 * In the stable condition the state would be
But there is a small transient window + * in softmac_destroy where we change the state to + * SOFTMAC_DETACH_INPROG and drop the lock before doing + * the link destroy + */ + ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt); + ASSERT(softmac->smac_state != SOFTMAC_UNINIT); + } + if (softmac->smac_mh != NULL) + ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt); +} +#endif + +#ifdef DEBUG +#define SOFTMAC_STATE_VERIFY(softmac) softmac_state_verify(softmac) +#else +#define SOFTMAC_STATE_VERIFY(softmac) +#endif + int softmac_create(dev_info_t *dip, dev_t dev) { @@ -181,9 +287,7 @@ softmac_create(dev_info_t *dip, dev_t dev) softmac = kmem_zalloc(sizeof (softmac_t), KM_SLEEP); mutex_init(&softmac->smac_mutex, NULL, MUTEX_DRIVER, NULL); cv_init(&softmac->smac_cv, NULL, CV_DRIVER, NULL); - rw_init(&softmac->smac_lock, NULL, RW_DRIVER, NULL); (void) strlcpy(softmac->smac_devname, devname, MAXNAMELEN); - /* * Insert the softmac into the hash table. */ @@ -191,9 +295,15 @@ softmac_create(dev_info_t *dip, dev_t dev) (mod_hash_key_t)softmac->smac_devname, (mod_hash_val_t)softmac); ASSERT(err == 0); + mutex_enter(&smac_global_lock); + cv_broadcast(&smac_global_cv); + mutex_exit(&smac_global_lock); } mutex_enter(&softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); + if (softmac->smac_state != SOFTMAC_ATTACH_DONE) + softmac->smac_state = SOFTMAC_ATTACH_INPROG; if (softmac->smac_attachok_cnt == 0) { /* * Initialize the softmac if this is the post-attach of the @@ -231,45 +341,26 @@ softmac_create(dev_info_t *dip, dev_t dev) index = (getmajor(dev) == ddi_name_to_major("clone")); if (softmac->smac_softmac[index] != NULL) { /* - * This is possible if the post_attach() is called: - * - * a. after pre_detach() fails. - * - * b. for a new round of reattachment. Note that DACF will not - * call pre_detach() for successfully post_attached minor - * nodes even when the post-attach failed after all. - * - * Both seem to be defects in the DACF framework. To work - * around it and only clear the SOFTMAC_ATTACH_DONE flag for - * the b case, a smac_attached_left field is used to tell - * the two cases apart. + * This is possible if the post_attach() is called after + * pre_detach() fails. This seems to be a defect of the DACF + * framework. We work around it by using a smac_attached_left + * field that tracks this */ - ASSERT(softmac->smac_attachok_cnt != 0); - - if (softmac->smac_attached_left != 0) - /* case a */ - softmac->smac_attached_left--; - else if (softmac->smac_attachok_cnt != softmac->smac_cnt) { - /* case b */ - softmac->smac_flags &= ~SOFTMAC_ATTACH_DONE; - } + ASSERT(softmac->smac_attached_left != 0); + softmac->smac_attached_left--; mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); return (0); + } mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); - /* - * No lock is needed for access this softmac pointer, as pre-detach and - * post-attach won't happen at the same time. - */ - mutex_enter(&softmac->smac_mutex); - softmac_dev = kmem_zalloc(sizeof (softmac_dev_t), KM_SLEEP); softmac_dev->sd_dev = dev; - softmac->smac_softmac[index] = softmac_dev; + mutex_enter(&softmac->smac_mutex); + softmac->smac_softmac[index] = softmac_dev; /* * Continue to register the mac and create the datalink only when all * the minor nodes are attached. @@ -281,18 +372,22 @@ softmac_create(dev_info_t *dip, dev_t dev) /* * All of the minor nodes have been attached; start a taskq - * to do the rest of the work. We use a taskq instead of of + * to do the rest of the work. 
We use a taskq instead of
	 * doing the work here because:
	 *
-	 * - We could be called as a result of an open() system call
-	 *   where spec_open() already SLOCKED the snode. Using a taskq
-	 *   sidesteps the risk that our ldi_open_by_dev() call would
-	 *   deadlock trying to set SLOCKED on the snode again.
+	 * We could be called as a result of an open() system call
+	 * where spec_open() already SLOCKED the snode. Using a taskq
+	 * sidesteps the risk that our ldi_open_by_dev() call would
+	 * deadlock trying to set SLOCKED on the snode again.
-	 * - The devfs design requires no interruptible function calls
-	 *   in the device post-attach routine, but we need to make an
-	 *   (interruptible) upcall. Using a taskq to make the upcall
-	 *   sidesteps this.
+	 * The devfs design requires that the downcalls don't use any
+	 * interruptible cv_wait, which happens when we do door upcalls.
+	 * Otherwise the downcalls, which may be holding devfs resources,
+	 * may cause a deadlock if the thread is stopped. Also we need to
+	 * make sure these downcalls into softmac_create or softmac_destroy
+	 * don't cv_wait on any devfs-related condition. Thus softmac_destroy
+	 * returns EBUSY if the asynchronous threads started in softmac_create
+	 * haven't finished.
	 */
	ASSERT(softmac->smac_taskq == NULL);
	softmac->smac_taskq = taskq_dispatch(system_taskq,
@@ -331,7 +426,6 @@ softmac_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
	 * simply return B_TRUE if we support it.
	 */
	case MAC_CAPAB_NO_ZCOPY:
-	case MAC_CAPAB_POLL:
	case MAC_CAPAB_NO_NATIVEVLAN:
	default:
		break;
@@ -396,8 +490,6 @@ softmac_create_datalink(softmac_t *softmac)
	datalink_id_t	linkid = DATALINK_INVALID_LINKID;
	int		err;

-	ASSERT(MUTEX_HELD(&softmac->smac_mutex));
-
	/*
	 * Inform dlmgmtd of this link so that softmac_hold_device() is able
	 * to know the existence of this link. If this failed with EBADF,
@@ -429,8 +521,11 @@
		return (err);
	}

-	if (linkid == DATALINK_INVALID_LINKID)
+	if (linkid == DATALINK_INVALID_LINKID) {
+		mutex_enter(&softmac->smac_mutex);
		softmac->smac_flags |= SOFTMAC_NEED_RECREATE;
+		mutex_exit(&softmac->smac_mutex);
+	}

	return (0);
}
@@ -453,6 +548,8 @@ softmac_create_task(void *arg)
	mutex_enter(&softmac->smac_mutex);
	softmac->smac_media = (mac_info(mh))->mi_nativemedia;
	softmac->smac_mh = mh;
+	softmac->smac_taskq = NULL;
+	mutex_exit(&softmac->smac_mutex);

	/*
	 * We can safely release the reference on the mac because
@@ -467,10 +564,13 @@
	 */
	err = softmac_create_datalink(softmac);

+	mutex_enter(&softmac->smac_mutex);
done:
-	ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE));
-	softmac->smac_flags |= SOFTMAC_ATTACH_DONE;
-	softmac->smac_attacherr = err;
+	if (err != 0) {
+		softmac->smac_mh = NULL;
+		softmac->smac_attacherr = err;
+	}
+	softmac->smac_state = SOFTMAC_ATTACH_DONE;
	softmac->smac_taskq = NULL;
	cv_broadcast(&softmac->smac_cv);
	mutex_exit(&softmac->smac_mutex);
@@ -498,6 +598,8 @@ softmac_mac_register(softmac_t *softmac)
	 * as softmac_destroy() will wait until this function is called.
	 */
	ASSERT(softmac != NULL);
+	ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG &&
+	    softmac->smac_attachok_cnt == softmac->smac_cnt);

	if ((err = ldi_ident_from_dip(softmac_dip, &li)) != 0) {
		mutex_enter(&softmac->smac_mutex);
@@ -617,11 +719,9 @@ softmac_mac_register(softmac_t *softmac)
	 * dl_bind() because some drivers return DL_ERROR_ACK if the
	 * stream is not bound. It is also before mac_register(), so
	 * we don't need any lock protection here.
- * - * Softmac always supports POLL. */ softmac->smac_capab_flags = - (MAC_CAPAB_POLL | MAC_CAPAB_NO_ZCOPY | MAC_CAPAB_LEGACY); + (MAC_CAPAB_NO_ZCOPY | MAC_CAPAB_LEGACY); softmac->smac_no_capability_req = B_FALSE; if (softmac_fill_capab(lh, softmac) != 0) @@ -714,6 +814,7 @@ softmac_mac_register(softmac_t *softmac) goto done; } } + mutex_exit(&softmac->smac_mutex); /* * Try to create the datalink for this softmac. @@ -724,10 +825,21 @@ softmac_mac_register(softmac_t *softmac) softmac->smac_mh = NULL; } } + /* + * If succeed, create the thread which handles the DL_NOTIFY_IND from + * the lower stream. + */ + if (softmac->smac_mh != NULL) { + softmac->smac_notify_thread = thread_create(NULL, 0, + softmac_notify_thread, softmac, 0, &p0, + TS_RUN, minclsyspri); + } + mutex_enter(&softmac->smac_mutex); done: - ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)); - softmac->smac_flags |= SOFTMAC_ATTACH_DONE; + ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG && + softmac->smac_attachok_cnt == softmac->smac_cnt); + softmac->smac_state = SOFTMAC_ATTACH_DONE; softmac->smac_attacherr = err; softmac->smac_taskq = NULL; cv_broadcast(&softmac->smac_cv); @@ -743,24 +855,37 @@ softmac_destroy(dev_info_t *dip, dev_t dev) int index; int ppa, err; datalink_id_t linkid; + mac_handle_t smac_mh; + uint32_t smac_flags; ppa = ddi_get_instance(dip); (void) snprintf(devname, MAXNAMELEN, "%s%d", ddi_driver_name(dip), ppa); - rw_enter(&softmac_hash_lock, RW_WRITER); + /* + * We are called only from the predetach entry point. The DACF + * framework ensures there can't be a concurrent postattach call + * for the same softmac. The softmac found out from the modhash + * below can't vanish beneath us since this is the only place where + * it is deleted. + */ err = mod_hash_find(softmac_hash, (mod_hash_key_t)devname, (mod_hash_val_t *)&softmac); ASSERT(err == 0); mutex_enter(&softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); /* * Fail the predetach routine if this softmac is in-use. + * Make sure these downcalls into softmac_create or softmac_destroy + * don't cv_wait on any devfs related condition. Thus softmac_destroy + * returns EBUSY if the asynchronous thread started in softmac_create + * hasn't finished */ - if (softmac->smac_hold_cnt != 0) { + if ((softmac->smac_hold_cnt != 0) || + (softmac->smac_state == SOFTMAC_ATTACH_INPROG)) { softmac->smac_attached_left = softmac->smac_attachok_cnt; mutex_exit(&softmac->smac_mutex); - rw_exit(&softmac_hash_lock); return (EBUSY); } @@ -772,78 +897,106 @@ softmac_destroy(dev_info_t *dip, dev_t dev) */ if (softmac->smac_attached_left != 0) { mutex_exit(&softmac->smac_mutex); - rw_exit(&softmac_hash_lock); return (EBUSY); } - if (softmac->smac_attachok_cnt != softmac->smac_cnt) - goto done; - - /* - * This is the detach for the first minor node. Wait until all the - * minor nodes are attached. - */ - while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)) - cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + smac_mh = softmac->smac_mh; + smac_flags = softmac->smac_flags; + softmac->smac_state = SOFTMAC_DETACH_INPROG; + mutex_exit(&softmac->smac_mutex); - if (softmac->smac_mh != NULL) { - if (!(softmac->smac_flags & SOFTMAC_NOSUPP)) { - if ((err = dls_devnet_destroy(softmac->smac_mh, - &linkid)) != 0) { - goto done; + if (smac_mh != NULL) { + /* + * This is the first minor node that is being detached for this + * softmac. 
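As an aside on the attach/detach bookkeeping used here, the state and count invariants from the state-machine table further up can be restated as a small standalone check. The names below are hypothetical simplifications of the softmac fields, not code from this commit:

#include <assert.h>

typedef enum {
	ST_UNINIT,
	ST_ATTACH_INPROG,
	ST_ATTACH_DONE,
	ST_DETACH_INPROG
} st_t;

typedef struct {
	st_t	d_state;
	int	d_cnt;		/* minor nodes for the device, at most 2 */
	int	d_okcnt;	/* minor nodes attached so far */
} dev_state_t;

/* cf. softmac_state_verify(): the counts pin down the legal states */
static void
dev_state_verify(const dev_state_t *d)
{
	assert(d->d_cnt <= 2 && d->d_okcnt <= d->d_cnt);
	if (d->d_okcnt == 0)
		assert(d->d_state == ST_UNINIT);
	else if (d->d_okcnt < d->d_cnt)
		assert(d->d_state == ST_ATTACH_INPROG ||
		    d->d_state == ST_DETACH_INPROG);
	else
		assert(d->d_state != ST_UNINIT);
}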
+ */ + ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt); + if (!(smac_flags & SOFTMAC_NOSUPP)) { + if ((err = dls_devnet_destroy(smac_mh, &linkid, + B_FALSE)) != 0) { + goto error; } } /* * If softmac_mac_register() succeeds in registering the mac * of the legacy device, unregister it. */ - if (!(softmac->smac_flags & (SOFTMAC_GLDV3 | SOFTMAC_NOSUPP))) { - if ((err = mac_unregister(softmac->smac_mh)) != 0) { - (void) dls_devnet_create(softmac->smac_mh, - linkid); - goto done; + if (!(smac_flags & (SOFTMAC_GLDV3 | SOFTMAC_NOSUPP))) { + if ((err = mac_disable_nowait(smac_mh)) != 0) { + (void) dls_devnet_create(smac_mh, linkid); + goto error; } + /* + * Ask softmac_notify_thread to quit, and wait for + * that to be done. + */ + mutex_enter(&softmac->smac_mutex); + softmac->smac_flags |= SOFTMAC_NOTIFY_QUIT; + cv_broadcast(&softmac->smac_cv); + while (softmac->smac_notify_thread != NULL) { + cv_wait(&softmac->smac_cv, + &softmac->smac_mutex); + } + mutex_exit(&softmac->smac_mutex); + VERIFY(mac_unregister(smac_mh) == 0); } softmac->smac_mh = NULL; } - softmac->smac_flags &= ~SOFTMAC_ATTACH_DONE; -done: - if (err == 0) { - /* - * Free softmac_dev - */ - index = (getmajor(dev) == ddi_name_to_major("clone")); - softmac_dev = softmac->smac_softmac[index]; - ASSERT(softmac_dev != NULL); - softmac->smac_softmac[index] = NULL; - kmem_free(softmac_dev, sizeof (softmac_dev_t)); - - if (--softmac->smac_attachok_cnt == 0) { - mod_hash_val_t hashval; - - err = mod_hash_remove(softmac_hash, - (mod_hash_key_t)devname, - (mod_hash_val_t *)&hashval); - ASSERT(err == 0); + /* + * Free softmac_dev + */ + rw_enter(&softmac_hash_lock, RW_WRITER); + mutex_enter(&softmac->smac_mutex); + ASSERT(softmac->smac_state == SOFTMAC_DETACH_INPROG && + softmac->smac_attachok_cnt != 0); + softmac->smac_mh = NULL; + index = (getmajor(dev) == ddi_name_to_major("clone")); + softmac_dev = softmac->smac_softmac[index]; + ASSERT(softmac_dev != NULL); + softmac->smac_softmac[index] = NULL; + kmem_free(softmac_dev, sizeof (softmac_dev_t)); + + if (--softmac->smac_attachok_cnt == 0) { + mod_hash_val_t hashval; + + softmac->smac_state = SOFTMAC_UNINIT; + if (softmac->smac_hold_cnt != 0) { + /* + * Someone did a softmac_hold_device while we dropped + * the locks. 
Leave the softmac itself intact which + * will be reused by the reattach + */ mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); - - ASSERT(softmac->smac_taskq == NULL); - ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)); - mutex_destroy(&softmac->smac_mutex); - cv_destroy(&softmac->smac_cv); - rw_destroy(&softmac->smac_lock); - kmem_free(softmac, sizeof (softmac_t)); return (0); } - } else { - softmac->smac_attached_left = softmac->smac_attachok_cnt; - } + ASSERT(softmac->smac_taskq == NULL); + err = mod_hash_remove(softmac_hash, + (mod_hash_key_t)devname, + (mod_hash_val_t *)&hashval); + ASSERT(err == 0); + + mutex_exit(&softmac->smac_mutex); + rw_exit(&softmac_hash_lock); + + mutex_destroy(&softmac->smac_mutex); + cv_destroy(&softmac->smac_cv); + kmem_free(softmac, sizeof (softmac_t)); + return (0); + } mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); + return (0); + +error: + mutex_enter(&softmac->smac_mutex); + softmac->smac_attached_left = softmac->smac_attachok_cnt; + softmac->smac_state = SOFTMAC_ATTACH_DONE; + cv_broadcast(&softmac->smac_cv); + mutex_exit(&softmac->smac_mutex); return (err); } @@ -863,17 +1016,33 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg) softmac_t *softmac = (softmac_t *)val; datalink_id_t linkid; int err; - - ASSERT(RW_READ_HELD(&softmac_hash_lock)); + softmac_walk_t *smwp = arg; /* - * Wait for softmac_create() and softmac_mac_register() to exit. + * The framework itself must not hold any locks across calls to the + * mac perimeter. Thus this function does not call any framework + * function that needs to grab the mac perimeter. */ + ASSERT(RW_READ_HELD(&softmac_hash_lock)); + + smwp->smw_retry = B_FALSE; mutex_enter(&softmac->smac_mutex); - while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)) - cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); + if (softmac->smac_state == SOFTMAC_ATTACH_INPROG) { + /* + * Wait till softmac_create or softmac_mac_register finishes + * Hold the softmac to ensure it stays around. The wait itself + * is done in the caller, since we need to drop all locks + * including the mod hash's internal lock before calling + * cv_wait. + */ + smwp->smw_retry = B_TRUE; + smwp->smw_softmac = softmac; + softmac->smac_hold_cnt++; + return (MH_WALK_TERMINATE); + } - if ((softmac->smac_attacherr != 0) || + if ((softmac->smac_state != SOFTMAC_ATTACH_DONE) || !(softmac->smac_flags & SOFTMAC_NEED_RECREATE)) { mutex_exit(&softmac->smac_mutex); return (MH_WALK_CONTINUE); @@ -918,13 +1087,30 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg) void softmac_recreate() { + softmac_walk_t smw; + softmac_t *softmac; + /* * Walk through the softmac_hash table. Request to create the * [link name, linkid] mapping if we failed to do so. */ - rw_enter(&softmac_hash_lock, RW_READER); - mod_hash_walk(softmac_hash, softmac_mac_recreate, NULL); - rw_exit(&softmac_hash_lock); + do { + smw.smw_retry = B_FALSE; + rw_enter(&softmac_hash_lock, RW_READER); + mod_hash_walk(softmac_hash, softmac_mac_recreate, &smw); + rw_exit(&softmac_hash_lock); + if (smw.smw_retry) { + /* + * softmac_create or softmac_mac_register hasn't yet + * finished and the softmac is not yet in the + * SOFTMAC_ATTACH_DONE state. 
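The hold-and-retry protocol this comment describes (pin the busy entry, terminate the walk, wait with only the entry's own lock, then rescan the table) can be modeled in userland roughly as follows. The types and names are illustrative only, with pthread locks standing in for krwlock/kmutex:

#include <pthread.h>

typedef struct entry {
	pthread_mutex_t	e_lock;
	pthread_cond_t	e_cv;
	int		e_busy;		/* attach still in progress */
	int		e_holds;	/* keeps the entry from being freed */
	struct entry	*e_next;
} entry_t;

static pthread_rwlock_t	tab_lock = PTHREAD_RWLOCK_INITIALIZER;
static entry_t		*tab_head;

static void
recreate_all(void)
{
	entry_t	*busy;

	do {
		busy = NULL;
		(void) pthread_rwlock_rdlock(&tab_lock);
		for (entry_t *e = tab_head; e != NULL; e = e->e_next) {
			(void) pthread_mutex_lock(&e->e_lock);
			if (e->e_busy) {
				e->e_holds++;	/* pin it, then bail out */
				busy = e;
				(void) pthread_mutex_unlock(&e->e_lock);
				break;
			}
			/* ... recreate work for a quiescent entry ... */
			(void) pthread_mutex_unlock(&e->e_lock);
		}
		(void) pthread_rwlock_unlock(&tab_lock);

		if (busy != NULL) {
			/* wait with no table lock held, then rescan */
			(void) pthread_mutex_lock(&busy->e_lock);
			while (busy->e_busy)
				(void) pthread_cond_wait(&busy->e_cv,
				    &busy->e_lock);
			busy->e_holds--;
			(void) pthread_mutex_unlock(&busy->e_lock);
		}
	} while (busy != NULL);
}

The point of the hold count is that the entry cannot vanish between dropping the table lock and the cv_wait, which is exactly what smac_hold_cnt guarantees above.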
+ */ + softmac = smw.smw_softmac; + cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + softmac->smac_hold_cnt--; + mutex_exit(&softmac->smac_mutex); + } + } while (smw.smw_retry); } /* ARGSUSED */ @@ -1064,20 +1250,14 @@ softmac_m_open(void *arg) softmac_lower_t *slp; int err; - rw_enter(&softmac->smac_lock, RW_READER); - if (softmac->smac_state == SOFTMAC_READY) - goto done; - rw_exit(&softmac->smac_lock); + ASSERT(MAC_PERIM_HELD(softmac->smac_mh)); + ASSERT(softmac->smac_lower_state == SOFTMAC_INITIALIZED); if ((err = softmac_lower_setup(softmac, &slp)) != 0) return (err); - rw_enter(&softmac->smac_lock, RW_WRITER); - ASSERT(softmac->smac_state == SOFTMAC_INITIALIZED); softmac->smac_lower = slp; - softmac->smac_state = SOFTMAC_READY; -done: - rw_exit(&softmac->smac_lock); + softmac->smac_lower_state = SOFTMAC_READY; return (0); } @@ -1087,7 +1267,8 @@ softmac_m_close(void *arg) softmac_t *softmac = arg; softmac_lower_t *slp; - rw_enter(&softmac->smac_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(softmac->smac_mh)); + ASSERT(softmac->smac_lower_state == SOFTMAC_READY); slp = softmac->smac_lower; ASSERT(slp != NULL); @@ -1095,9 +1276,8 @@ softmac_m_close(void *arg) * Note that slp is destroyed when lh is closed. */ (void) ldi_close(slp->sl_lh, FREAD|FWRITE, kcred); - softmac->smac_state = SOFTMAC_INITIALIZED; + softmac->smac_lower_state = SOFTMAC_INITIALIZED; softmac->smac_lower = NULL; - rw_exit(&softmac->smac_lock); } int @@ -1146,7 +1326,10 @@ again: * be recreated when device fails to detach (as this device * is held). */ + mutex_enter(&smac_global_lock); rw_exit(&softmac_hash_lock); + cv_wait(&smac_global_cv, &smac_global_lock); + mutex_exit(&smac_global_lock); goto again; } @@ -1155,17 +1338,16 @@ again: */ mutex_enter(&softmac->smac_mutex); softmac->smac_hold_cnt++; - mutex_exit(&softmac->smac_mutex); - rw_exit(&softmac_hash_lock); /* * Wait till the device is fully attached. */ - mutex_enter(&softmac->smac_mutex); - while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)) + while (softmac->smac_state != SOFTMAC_ATTACH_DONE) cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); + if ((err = softmac->smac_attacherr) != 0) softmac->smac_hold_cnt--; else diff --git a/usr/src/uts/common/io/softmac/softmac_pkt.c b/usr/src/uts/common/io/softmac/softmac_pkt.c index 3587fa515a..4b8d7e3049 100644 --- a/usr/src/uts/common/io/softmac/softmac_pkt.c +++ b/usr/src/uts/common/io/softmac/softmac_pkt.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/strsubr.h> #include <inet/led.h> #include <sys/softmac_impl.h> @@ -69,40 +67,6 @@ softmac_m_tx(void *arg, mblk_t *mp) return (mp); } -/*ARGSUSED*/ -static void -softmac_blank(void *arg, time_t ticks, uint_t count) -{ -} - -void -softmac_m_resources(void *arg) -{ - softmac_t *softmac = arg; - softmac_lower_t *slp = softmac->smac_lower; - mac_rx_fifo_t mrf; - - ASSERT((softmac->smac_state == SOFTMAC_READY) && (slp != NULL)); - - /* - * Register rx resources and save resource handle for future reference. - * Note that the mac_resources() function must be called when the lower - * stream is plumbed. 
- */ - - mutex_enter(&slp->sl_mutex); - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = softmac_blank; - mrf.mrf_arg = slp; - mrf.mrf_normal_blank_time = SOFTMAC_BLANK_TICKS; - mrf.mrf_normal_pkt_count = SOFTMAC_BLANK_PKT_COUNT; - - slp->sl_handle = - mac_resource_add(softmac->smac_mh, (mac_resource_t *)&mrf); - - mutex_exit(&slp->sl_mutex); -} void softmac_rput_process_data(softmac_lower_t *slp, mblk_t *mp) @@ -125,7 +89,7 @@ softmac_rput_process_data(softmac_lower_t *slp, mblk_t *mp) mp = tmp; } - mac_rx(slp->sl_softmac->smac_mh, slp->sl_handle, mp); + mac_rx(slp->sl_softmac->smac_mh, NULL, mp); return; failed: diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c index ffb7753e09..27b9cc8843 100644 --- a/usr/src/uts/common/io/strplumb.c +++ b/usr/src/uts/common/io/strplumb.c @@ -69,7 +69,7 @@ #include <sys/ddi_implfuncs.h> #include <sys/dld.h> -#include <sys/mac.h> +#include <sys/mac_client.h> /* * Debug Macros diff --git a/usr/src/uts/common/io/ural/ural.c b/usr/src/uts/common/io/ural/ural.c index 5b54d54935..b474dd8c2c 100644 --- a/usr/src/uts/common/io/ural/ural.c +++ b/usr/src/uts/common/io/ural/ural.c @@ -43,7 +43,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -295,7 +295,6 @@ static mac_callbacks_t ural_m_callbacks = { ural_m_multicst, ural_m_unicst, ural_m_tx, - NULL, /* mc_resources; */ ural_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/vnic/vnic_bcast.c b/usr/src/uts/common/io/vnic/vnic_bcast.c deleted file mode 100644 index 28ba800fd5..0000000000 --- a/usr/src/uts/common/io/vnic/vnic_bcast.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/list.h> -#include <sys/kmem.h> -#include <sys/stream.h> -#include <sys/modctl.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/atomic.h> -#include <sys/stat.h> -#include <sys/modhash.h> -#include <sys/strsubr.h> -#include <sys/strsun.h> -#include <sys/mac.h> -#include <sys/vnic.h> -#include <sys/vnic_impl.h> - -/* - * Broadcast and multicast traffic must be distributed to the VNICs - * that are defined on top of the same underlying NIC. The set of - * destinations to which a multicast packet must be sent is a subset - * of all VNICs defined on top of the same NIC. A VNIC can be member - * of more than one such subset. 
- * - * To accomodate these requirements, we introduce broadcast groups. - * A broadcast group is associated with a broadcast or multicast - * address. The members of a broadcast group consist of the VNICs - * that should received copies of packets sent to the address - * associated with the group, and are defined on top of the - * same underlying NIC. The underlying NIC is always implicetely - * part of the group. - * - * The broadcast groups defined on top of a underlying NIC are chained, - * hanging off vnic_mac_t structures. - */ - -typedef struct vnic_bcast_grp_s { - struct vnic_bcast_grp_s *vbg_next; - uint_t vbg_refs; - void *vbg_addr; - vnic_mac_t *vbg_vnic_mac; - mac_addrtype_t vbg_addrtype; - vnic_flow_t *vbg_flow_ent; - vnic_t **vbg_vnics; - uint_t vbg_nvnics; - uint_t vbg_nvnics_alloc; - uint64_t vbg_vnics_gen; -} vnic_bcast_grp_t; - -#define VNIC_BCAST_GRP_REFHOLD(grp) { \ - atomic_add_32(&(grp)->vbg_refs, 1); \ - ASSERT((grp)->vbg_refs != 0); \ -} - -#define VNIC_BCAST_GRP_REFRELE(grp) { \ - ASSERT((grp)->vbg_refs != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(grp)->vbg_refs, -1) == 0) \ - vnic_bcast_grp_free(grp); \ -} - -static kmem_cache_t *vnic_bcast_grp_cache; - -void -vnic_bcast_init(void) -{ - vnic_bcast_grp_cache = kmem_cache_create("vnic_bcast_grp_cache", - sizeof (vnic_bcast_grp_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -vnic_bcast_fini(void) -{ - kmem_cache_destroy(vnic_bcast_grp_cache); -} - -/* - * Free the specific broadcast group. Invoked when the last reference - * to the group is released. - */ -static void -vnic_bcast_grp_free(vnic_bcast_grp_t *grp) -{ - vnic_mac_t *vnic_mac = grp->vbg_vnic_mac; - - if (grp->vbg_addrtype == MAC_ADDRTYPE_MULTICAST) { - /* - * The address is a multicast address, have the - * underlying NIC leave the multicast group. - */ - (void) mac_multicst_remove(vnic_mac->va_mh, grp->vbg_addr); - } - - ASSERT(grp->vbg_addr != NULL); - kmem_free(grp->vbg_addr, grp->vbg_vnic_mac->va_addr_len); - - ASSERT(grp->vbg_vnics != NULL); - kmem_free(grp->vbg_vnics, grp->vbg_nvnics_alloc * sizeof (vnic_t *)); - - kmem_cache_free(vnic_bcast_grp_cache, grp); -} - -void -vnic_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain) -{ - vnic_bcast_grp_t *grp = arg1; - vnic_t *sender_vnic = arg2, *vnic; - const vnic_flow_fn_info_t *fn_info; - krwlock_t *grp_lock = &grp->vbg_vnic_mac->va_bcast_grp_lock; - uint64_t gen; - uint_t i; - mblk_t *mp_chain1; - vnic_mac_t *vnic_mac; - - VNIC_BCAST_GRP_REFHOLD(grp); - rw_enter(grp_lock, RW_READER); - - if (grp->vbg_nvnics == 0) - goto bail; - vnic_mac = grp->vbg_vnics[0]->vn_vnic_mac; - - /* - * Pass a copy of the mp chain to every VNIC except the sender - * VNIC, if the packet was not received from the underlying NIC. - * - * The broadcast group lock across calls to the flow's callback - * function, since the same group could potentially be accessed - * from the same context. When the lock is reacquired, changes - * to the broadcast group while the lock was released - * are caught using a generation counter incremented each time - * the list of VNICs associated with the broadcast group - * is changed. - */ - for (i = 0; i < grp->vbg_nvnics; i++) { - vnic = grp->vbg_vnics[i]; - if (vnic == sender_vnic) - continue; - - /* - * If this consumer is in promiscuous mode then it - * will have already seen a copy of the packet. - */ - if (vnic->vn_promisc) - continue; - /* - * It is important to hold a reference on the - * flow_ent here. 
vnic_dev_delete() may be waiting - * to delete the vnic after removing it from grp. - */ - if ((mp_chain1 = vnic_copymsgchain_cksum(mp_chain)) == NULL) - break; - /* - * Fix the checksum for packets originating - * from the local machine. - */ - if ((sender_vnic != NULL) && - ((mp_chain1 = vnic_fix_cksum(mp_chain1)) == NULL)) - break; - VNIC_FLOW_REFHOLD(vnic->vn_flow_ent); - fn_info = vnic_classifier_get_fn_info(vnic->vn_flow_ent); - gen = grp->vbg_vnics_gen; - rw_exit(grp_lock); - (fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp_chain1); - VNIC_FLOW_REFRELE(vnic->vn_flow_ent); - rw_enter(grp_lock, RW_READER); - - /* update stats */ - if (grp->vbg_addrtype == MAC_ADDRTYPE_MULTICAST) - vnic->vn_stat_multircv++; - else - vnic->vn_stat_brdcstrcv++; - - if (grp->vbg_vnics_gen != gen) { - /* - * The list of VNICs associated with the group - * was changed while the lock was released. - * Give up on the current packet. - */ - freemsgchain(mp_chain); - goto bail; - } - } - - if (sender_vnic != NULL) { - /* - * The packet was sent from one of the VNICs - * (vnic_active_tx()), or from the active MAC - * (vnic_active_tx()). In both cases, we need to send - * a copy of the packet to the underlying NIC so that - * it can be sent on the wire. - */ - const mac_txinfo_t *mtp = vnic_mac->va_txinfo; - mblk_t *rest; - - if ((mp_chain1 = vnic_copymsgchain_cksum(mp_chain)) != NULL) { - rw_exit(grp_lock); - rest = mtp->mt_fn(mtp->mt_arg, mp_chain1); - rw_enter(grp_lock, RW_READER); - if (rest != NULL) - freemsgchain(rest); - } - } - - if ((sender_vnic != (vnic_t *)-1) && (sender_vnic != NULL)) { - /* - * Called while sending a packet from one of the VNICs. - * Make sure the active interface gets its copy. - */ - mp_chain1 = (sender_vnic != NULL) ? vnic_fix_cksum(mp_chain) : - mp_chain; - if (mp_chain1 != NULL) { - rw_exit(grp_lock); - mac_active_rx(vnic_mac->va_mh, NULL, mp_chain1); - rw_enter(grp_lock, RW_READER); - } - } else { - freemsgchain(mp_chain); - } -bail: - rw_exit(grp_lock); - VNIC_BCAST_GRP_REFRELE(grp); -} - -/* - * Add the specified VNIC to the group corresponding to the specified - * broadcast or multicast address. - * Return 0 on success, or an errno value on failure. - */ -int -vnic_bcast_add(vnic_t *vnic, const uint8_t *addr, mac_addrtype_t addrtype) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - vnic_bcast_grp_t *grp = NULL, **last_grp; - int rc = 0; - - ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST || - addrtype == MAC_ADDRTYPE_BROADCAST); - - rw_enter(&vnic_mac->va_bcast_grp_lock, RW_WRITER); - - /* - * Does a group with the specified broadcast address already - * exist for the underlying NIC? - */ - last_grp = &vnic_mac->va_bcast_grp; - for (grp = *last_grp; grp != NULL; - last_grp = &grp->vbg_next, grp = grp->vbg_next) { - if (bcmp(grp->vbg_addr, addr, vnic_mac->va_addr_len) == 0) - break; - } - - if (grp == NULL) { - /* - * The group does not yet exist, create it. - */ - grp = kmem_cache_alloc(vnic_bcast_grp_cache, KM_SLEEP); - bzero(grp, sizeof (vnic_bcast_grp_t)); - grp->vbg_next = NULL; - ASSERT(grp->vbg_refs == 0); - grp->vbg_vnic_mac = vnic_mac; - - grp->vbg_addr = kmem_zalloc(vnic_mac->va_addr_len, KM_SLEEP); - bcopy(addr, grp->vbg_addr, vnic_mac->va_addr_len); - grp->vbg_addrtype = addrtype; - - /* - * Add a new flow for the broadcast address. 
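The generation-count idea described in the removed vnic_bcast_send() comment deserves a standalone illustration: the group lock is dropped around each delivery callback, and a counter bumped on every membership change tells the sender whether its cursor over the member array is still valid. A compact userland model with invented names:

#include <pthread.h>
#include <stdint.h>

#define	GRP_MAX	8

typedef struct grp {
	pthread_rwlock_t	g_lock;
	uint64_t		g_gen;	/* bumped on membership change */
	int			g_nmembers;
	void			(*g_deliver[GRP_MAX])(int pkt);
} grp_t;

static void
grp_send(grp_t *grp, int pkt)
{
	(void) pthread_rwlock_rdlock(&grp->g_lock);
	for (int i = 0; i < grp->g_nmembers; i++) {
		void		(*fn)(int) = grp->g_deliver[i];
		uint64_t	gen = grp->g_gen;

		/* the callback must not run under the group lock */
		(void) pthread_rwlock_unlock(&grp->g_lock);
		fn(pkt);
		(void) pthread_rwlock_rdlock(&grp->g_lock);

		if (grp->g_gen != gen)
			break;		/* membership changed; give up */
	}
	(void) pthread_rwlock_unlock(&grp->g_lock);
}

A writer updating g_deliver or g_nmembers would take g_lock as writer and increment g_gen, which is what invalidates an in-flight send, just as vbg_vnics_gen does.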
- */ - grp->vbg_flow_ent = vnic_classifier_flow_create( - vnic_mac->va_addr_len, (uchar_t *)addr, grp, B_FALSE, - KM_NOSLEEP); - if (grp->vbg_flow_ent == NULL) { - rc = ENOMEM; - goto bail; - } - - /* - * When the multicast and broadcast packet is received - * by the underlying NIC, mac_rx_classify() will invoke - * vnic_bcast_send() with arg2=NULL, which will cause - * vnic_bcast_send() to send a copy of the packet(s) - * to every VNIC defined on top of the underlying MAC. - * - * When the vnic_bcast_send() function is invoked from - * the VNIC transmit path, it will specify the transmitting - * VNIC as the arg2 value, which will allow vnic_bcast_send() - * to skip that VNIC and not send it a copy of the packet. - * - * We program the classifier to dispatch matching broadcast - * packets to vnic_bcast_send(). - * We need a ring allocated for this bcast flow, so that - * later snooping of the underlying MAC uses the same scheme - * of intercepting the ring's receiver to mac_rx_promisc(). - * For the economy of hardware resources, we command the MAC - * classifier to use a soft ring for these broadcast and - * multicast flows. - */ - vnic_classifier_flow_add(vnic_mac, grp->vbg_flow_ent, - vnic_bcast_send, grp, NULL); - - /* - * For multicast addresses, have the underlying MAC - * join the corresponsing multicast group. - */ - if ((addrtype == MAC_ADDRTYPE_MULTICAST) && - ((rc = mac_multicst_add(vnic_mac->va_mh, addr)) != 0)) { - vnic_classifier_flow_remove(vnic->vn_vnic_mac, - grp->vbg_flow_ent); - vnic_classifier_flow_destroy(grp->vbg_flow_ent); - goto bail; - } - - *last_grp = grp; - } - - /* - * Add the VNIC to the list of VNICs associated with the group. - */ - if (grp->vbg_nvnics_alloc == grp->vbg_nvnics) { - vnic_t **new_vnics; - uint_t new_size = grp->vbg_nvnics+1; - - new_vnics = kmem_zalloc(new_size * sizeof (vnic_t *), - KM_SLEEP); - - if (grp->vbg_nvnics) { - ASSERT(grp->vbg_vnics != NULL); - bcopy(grp->vbg_vnics, new_vnics, grp->vbg_nvnics * - sizeof (vnic_t *)); - kmem_free(grp->vbg_vnics, grp->vbg_nvnics * - sizeof (vnic_t *)); - } - - grp->vbg_vnics = new_vnics; - grp->vbg_nvnics_alloc = new_size; - } - - grp->vbg_vnics[grp->vbg_nvnics++] = vnic; - - /* - * Since we're adding to the list of VNICs using that group, - * kick the generation count, which will allow vnic_bcast_send() - * to detect that condition. - */ - grp->vbg_vnics_gen++; - - VNIC_BCAST_GRP_REFHOLD(grp); - -bail: - if (rc != 0 && grp != NULL) { - kmem_free(grp->vbg_addr, vnic_mac->va_addr_len); - kmem_cache_free(vnic_bcast_grp_cache, grp); - } - - rw_exit(&vnic->vn_vnic_mac->va_bcast_grp_lock); - return (rc); -} - -/* - * Remove the specified VNIC from the group corresponding to - * the specific broadcast or multicast address. - * - * Note: vnic_bcast_delete() calls net_remove_flow() which - * will call cv_wait for fe_refcnt to drop to 0. So this function - * should not be called from interrupt or STREAMS context. The only - * callers are vnic_dev_delete() and vnic_m_multicst() (both of - * which are called from taskq thread context). 
- */ -void -vnic_bcast_delete(vnic_t *vnic, const uint8_t *addr) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - vnic_bcast_grp_t *grp, **prev; - uint_t i; - boolean_t removing_grp = B_FALSE; - - rw_enter(&vnic_mac->va_bcast_grp_lock, RW_WRITER); - - /* find the broadcast group */ - prev = &vnic_mac->va_bcast_grp; - for (grp = vnic_mac->va_bcast_grp; grp != NULL; prev = &grp->vbg_next, - grp = grp->vbg_next) { - if (bcmp(grp->vbg_addr, addr, vnic_mac->va_addr_len) == 0) - break; - } - ASSERT(grp != NULL); - - /* - * Remove the VNIC from the list of VNICs associated with that - * broadcast group. - * - * We keep the vbg_vnics[] always compact by repacing - * the removed vnic with the last non NULL element in that array. - */ - - for (i = 0; i < grp->vbg_nvnics; i++) { - if (grp->vbg_vnics[i] == vnic) - break; - } - - ASSERT(i < grp->vbg_nvnics); - - if (i == (grp->vbg_nvnics-1)) { - grp->vbg_vnics[i] = NULL; - } else { - grp->vbg_vnics[i] = grp->vbg_vnics[grp->vbg_nvnics-1]; - grp->vbg_vnics[grp->vbg_nvnics-1] = NULL; - } - - /* - * Since we're removing from the list of VNICs using that group, - * kick the generation count, which will allow vnic_bcast_send() - * to detect that condition. - */ - grp->vbg_vnics_gen++; - - if (--grp->vbg_nvnics == 0) { - /* - * Unlink the current group from the list of groups - * defined on top of the underlying NIC. The group - * structure will stay around until the last reference - * is dropped. - */ - *prev = grp->vbg_next; - removing_grp = B_TRUE; - } - - rw_exit(&vnic->vn_vnic_mac->va_bcast_grp_lock); - - /* - * If the group itself is being removed, remove the - * corresponding flow from the underlying NIC. - */ - if (removing_grp) { - vnic_classifier_flow_remove(vnic->vn_vnic_mac, - grp->vbg_flow_ent); - vnic_classifier_flow_destroy(grp->vbg_flow_ent); - } - - VNIC_BCAST_GRP_REFRELE(grp); -} diff --git a/usr/src/uts/common/io/vnic/vnic_cl.c b/usr/src/uts/common/io/vnic/vnic_cl.c deleted file mode 100644 index b7939f141d..0000000000 --- a/usr/src/uts/common/io/vnic/vnic_cl.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/vnic.h> -#include <sys/vnic_impl.h> - -/* - * Virtual Network Interface Card (VNIC) classification. - * - * The VNIC implements a software classifier which is used to steer - * traffic (locally and externally generated) to the appropriate VNIC - * based on MAC addresses. 
- */ - -static kmem_cache_t *vnic_flow_cache; -static kmem_cache_t *vnic_flow_tab_cache; - -static void vnic_classifier_rx(void *, mac_resource_handle_t, mblk_t *); - -/* ARGSUSED */ -static int -vnic_classifier_flow_tab_ctor(void *buf, void *arg, int km_flag) -{ - vnic_flow_tab_t *flow_tab = buf; - - bzero(flow_tab, sizeof (vnic_flow_tab_t)); - rw_init(&flow_tab->vt_lock, NULL, RW_DRIVER, NULL); - return (0); -} - -/* ARGSUSED */ -static void -vnic_classifier_flow_tab_dtor(void *buf, void *arg) -{ - vnic_flow_tab_t *flow_tab = buf; - - rw_destroy(&flow_tab->vt_lock); -} - -/* ARGSUSED */ -static int -vnic_classifier_flow_ctor(void *buf, void *arg, int km_flag) -{ - vnic_flow_t *flow = buf; - - bzero(flow, sizeof (vnic_flow_t)); - mutex_init(&flow->vf_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&flow->vf_cv, NULL, CV_DRIVER, NULL); - return (0); -} - -/* ARGSUSED */ -static void -vnic_classifier_flow_dtor(void *buf, void *arg) -{ - vnic_flow_t *flow = buf; - - ASSERT(flow->vf_refs == 0); - mutex_destroy(&flow->vf_lock); - cv_destroy(&flow->vf_cv); -} - -void -vnic_classifier_init(void) -{ - vnic_flow_cache = kmem_cache_create("vnic_flow_cache", - sizeof (vnic_flow_t), 0, vnic_classifier_flow_ctor, - vnic_classifier_flow_dtor, NULL, NULL, NULL, 0); - vnic_flow_tab_cache = kmem_cache_create("vnic_flow_tab_cache", - sizeof (vnic_flow_tab_t), 0, vnic_classifier_flow_tab_ctor, - vnic_classifier_flow_tab_dtor, NULL, NULL, NULL, 0); -} - -void -vnic_classifier_fini(void) -{ - kmem_cache_destroy(vnic_flow_cache); - kmem_cache_destroy(vnic_flow_tab_cache); -} - -int -vnic_classifier_flow_tab_init(vnic_mac_t *vnic_mac, uint_t mac_len, - int km_flag) -{ - vnic_mac->va_flow_tab = kmem_cache_alloc(vnic_flow_tab_cache, km_flag); - if (vnic_mac->va_flow_tab == NULL) - return (ENOMEM); - vnic_mac->va_rx_hdl = mac_rx_add(vnic_mac->va_mh, vnic_classifier_rx, - vnic_mac); - vnic_mac->va_flow_tab->vt_addr_len = mac_len; - return (0); -} - -void -vnic_classifier_flow_tab_fini(vnic_mac_t *vnic_mac) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - - ASSERT(flow_tab->vt_flow_list == NULL); - mac_rx_remove(vnic_mac->va_mh, vnic_mac->va_rx_hdl, B_TRUE); - kmem_cache_free(vnic_flow_tab_cache, flow_tab); - vnic_mac->va_flow_tab = NULL; -} - -vnic_flow_t * -vnic_classifier_flow_create(uint_t mac_len, uchar_t *mac_addr, - void *flow_cookie, boolean_t is_active, int km_flag) -{ - vnic_flow_t *flow; - - ASSERT(mac_len <= MAXMACADDRLEN); - - if ((flow = kmem_cache_alloc(vnic_flow_cache, km_flag)) == NULL) - return (NULL); - - flow->vf_addr_len = mac_len; - flow->vf_cookie = flow_cookie; - flow->vf_clearing = B_FALSE; - flow->vf_is_active = is_active; - bcopy(mac_addr, flow->vf_addr, mac_len); - return (flow); -} - -void -vnic_classifier_flow_destroy(vnic_flow_t *flow) -{ - kmem_cache_free(vnic_flow_cache, flow); -} - -void -vnic_classifier_flow_add(vnic_mac_t *vnic_mac, vnic_flow_t *flow, - vnic_rx_fn_t rx_fn, void *rx_arg1, void *rx_arg2) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - vnic_flow_t **cur_flow; - - ASSERT(flow->vf_addr_len == flow_tab->vt_addr_len); - - /* initialize the flow structure */ - flow->vf_fn_info.ff_fn = rx_fn; - flow->vf_fn_info.ff_arg1 = rx_arg1; - flow->vf_fn_info.ff_arg2 = rx_arg2; - - /* add to the flow table */ - rw_enter(&flow_tab->vt_lock, RW_WRITER); - for (cur_flow = &flow_tab->vt_flow_list; - *cur_flow != NULL; - cur_flow = &(*cur_flow)->vf_next) - ; - *cur_flow = flow; - flow->vf_next = NULL; - rw_exit(&flow_tab->vt_lock); -} - -void 
-vnic_classifier_flow_remove(vnic_mac_t *vnic_mac, vnic_flow_t *flow) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - vnic_flow_t **prev, *cur; - - /* unlink from list */ - rw_enter(&flow_tab->vt_lock, RW_WRITER); - prev = &flow_tab->vt_flow_list; - for (cur = *prev; cur != NULL && cur != flow; - prev = &cur->vf_next, cur = cur->vf_next) - ; - *prev = cur->vf_next; - rw_exit(&flow_tab->vt_lock); - - /* wait for all references to the flow to go away */ - mutex_enter(&flow->vf_lock); - flow->vf_clearing = B_TRUE; - while (flow->vf_refs > 0) - cv_wait(&flow->vf_cv, &flow->vf_lock); - mutex_exit(&flow->vf_lock); -} - -void -vnic_classifier_flow_update_addr(vnic_flow_t *flow, uchar_t *mac_addr) -{ - bcopy(mac_addr, flow->vf_addr, flow->vf_addr_len); -} - -void -vnic_classifier_flow_update_fn(vnic_flow_t *flow, vnic_rx_fn_t fn, - void *arg1, void *arg2) -{ - flow->vf_fn_info.ff_fn = fn; - flow->vf_fn_info.ff_arg1 = arg1; - flow->vf_fn_info.ff_arg2 = arg2; -} - -vnic_flow_t * -vnic_classifier_get_flow(vnic_mac_t *vnic_mac, mblk_t *mp) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - vnic_flow_t *flow; - mac_header_info_t hdr_info; - - if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0) - return (NULL); - - rw_enter(&flow_tab->vt_lock, RW_READER); - for (flow = flow_tab->vt_flow_list; flow != NULL; - flow = flow->vf_next) { - if (bcmp(hdr_info.mhi_daddr, flow->vf_addr, - flow_tab->vt_addr_len) == 0) { - VNIC_FLOW_REFHOLD(flow); - break; - } - } - rw_exit(&flow_tab->vt_lock); - return (flow); -} - -void * -vnic_classifier_get_client_cookie(vnic_flow_t *flow) -{ - return (flow->vf_cookie); -} - -vnic_flow_fn_info_t * -vnic_classifier_get_fn_info(vnic_flow_t *flow) -{ - return (&flow->vf_fn_info); -} - -boolean_t -vnic_classifier_is_active(vnic_flow_t *flow) -{ - return (flow->vf_is_active); -} - -/* - * Receive function registered with the MAC layer. Classifies - * the packets, i.e. finds the flows matching the packets passed - * as argument, and invokes the callback functions associated with - * these flows. - */ -/*ARGSUSED*/ -static void -vnic_classifier_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) -{ - vnic_mac_t *vnic_mac = arg; - vnic_flow_t *flow; - mblk_t *next_mp; - const vnic_flow_fn_info_t *fn_info; - - while (mp != NULL) { - next_mp = mp->b_next; - mp->b_next = NULL; - - vnic_promisc_rx(vnic_mac, NULL, mp); - - flow = vnic_classifier_get_flow(vnic_mac, mp); - if (flow == NULL) { - freemsg(mp); - } else { - if (flow->vf_is_active) { - /* - * Inbound packets are delivered to the - * active MAC through mac_rx() of the - * the NIC. - */ - freemsg(mp); - } else { - vnic_t *vnic; - - fn_info = vnic_classifier_get_fn_info(flow); - - /* - * If the vnic to which we would - * deliver this packet is in - * promiscuous mode then it already - * received the packet via - * vnic_promisc_rx(). - * - * XXX assumes that ff_arg2 is a - * vnic_t pointer if it is non-NULL - * (currently always true). 
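The flow removal path shown above is a reference-count quiesce: lookups take a hold on a flow, and removal unlinks it from the table, raises a clearing flag, and sleeps until all holds drain. A userland rendering of just that protocol, with illustrative names:

#include <pthread.h>

typedef struct flow {
	pthread_mutex_t	f_lock;
	pthread_cond_t	f_cv;
	int		f_refs;		/* taken by lookups */
	int		f_clearing;	/* set once removal starts */
} flow_t;

static void
flow_refrele(flow_t *f)
{
	(void) pthread_mutex_lock(&f->f_lock);
	if (--f->f_refs == 0 && f->f_clearing)
		(void) pthread_cond_broadcast(&f->f_cv);
	(void) pthread_mutex_unlock(&f->f_lock);
}

/* caller has already unlinked f from the lookup table */
static void
flow_quiesce(flow_t *f)
{
	(void) pthread_mutex_lock(&f->f_lock);
	f->f_clearing = 1;
	while (f->f_refs > 0)
		(void) pthread_cond_wait(&f->f_cv, &f->f_lock);
	(void) pthread_mutex_unlock(&f->f_lock);
	/* no lookup can now hold f; it is safe to destroy */
}

Because new lookups can no longer find the unlinked flow, the wait can only ever see the count go down, which is why the original code could safely destroy the flow afterwards.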
- */ - vnic = (vnic_t *)fn_info->ff_arg2; - if ((vnic != NULL) && vnic->vn_promisc) { - freemsg(mp); - } else { - (fn_info->ff_fn)(fn_info->ff_arg1, - fn_info->ff_arg2, mp); - } - } - VNIC_FLOW_REFRELE(flow); - } - mp = next_mp; - } -} diff --git a/usr/src/uts/common/io/vnic/vnic_ctl.c b/usr/src/uts/common/io/vnic/vnic_ctl.c index a2873c9601..d4f5554949 100644 --- a/usr/src/uts/common/io/vnic/vnic_ctl.c +++ b/usr/src/uts/common/io/vnic/vnic_ctl.c @@ -31,62 +31,35 @@ #include <sys/modctl.h> #include <sys/vnic.h> #include <sys/vnic_impl.h> -#include <inet/common.h> +#include <sys/priv_names.h> /* module description */ -#define VNIC_LINKINFO "VNIC MAC" +#define VNIC_LINKINFO "Virtual NIC" /* device info ptr, only one for instance 0 */ static dev_info_t *vnic_dip = NULL; static int vnic_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int vnic_attach(dev_info_t *, ddi_attach_cmd_t); static int vnic_detach(dev_info_t *, ddi_detach_cmd_t); -static dld_ioc_func_t vnic_ioc_create, vnic_ioc_modify, vnic_ioc_delete, - vnic_ioc_info; + +static int vnic_ioc_create(void *, intptr_t, int, cred_t *, int *); +static int vnic_ioc_delete(void *, intptr_t, int, cred_t *, int *); +static int vnic_ioc_info(void *, intptr_t, int, cred_t *, int *); +static int vnic_ioc_modify(void *, intptr_t, int, cred_t *, int *); static dld_ioc_info_t vnic_ioc_list[] = { - {VNIC_IOC_CREATE, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_create_t), - vnic_ioc_create}, - {VNIC_IOC_DELETE, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_delete_t), - vnic_ioc_delete}, + {VNIC_IOC_CREATE, DLDCOPYINOUT, sizeof (vnic_ioc_create_t), + vnic_ioc_create, {PRIV_SYS_DL_CONFIG}}, + {VNIC_IOC_DELETE, DLDCOPYIN, sizeof (vnic_ioc_delete_t), + vnic_ioc_delete, {PRIV_SYS_DL_CONFIG}}, {VNIC_IOC_INFO, DLDCOPYINOUT, sizeof (vnic_ioc_info_t), - vnic_ioc_info}, - {VNIC_IOC_MODIFY, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_modify_t), - vnic_ioc_modify} -}; - -static struct cb_ops vnic_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ - nulldev, /* strategy */ - nulldev, /* print */ - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - nodev, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* cb_prop_op */ - 0, /* streamtab */ - D_MP /* Driver compatibility flag */ + vnic_ioc_info, {NULL}}, + {VNIC_IOC_MODIFY, DLDCOPYIN, sizeof (vnic_ioc_modify_t), + vnic_ioc_modify, {PRIV_SYS_DL_CONFIG}}, }; -static struct dev_ops vnic_dev_ops = { - DEVO_REV, /* devo_rev */ - 0, /* refcnt */ - vnic_getinfo, /* get_dev_info */ - nulldev, /* identify */ - nulldev, /* probe */ - vnic_attach, /* attach */ - vnic_detach, /* detach */ - nodev, /* reset */ - &vnic_cb_ops, /* driver operations */ - NULL, /* bus operations */ - nodev, /* dev power */ - ddi_quiesce_not_supported, /* dev quiesce */ -}; +DDI_DEFINE_STREAM_OPS(vnic_dev_ops, nulldev, nulldev, vnic_attach, vnic_detach, + nodev, vnic_getinfo, D_MP, NULL, ddi_quiesce_not_supported); static struct modldrv vnic_modldrv = { &mod_driverops, /* Type of module. 
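The vnic_ioc_list[] table above hands the dld framework one record per command: the ioctl number, a copy mode, the argument size, the handler, and the privilege required. The table-driven dispatch idiom it relies on looks roughly like this in portable C (a sketch of the idiom, not the dld implementation):

#include <stddef.h>
#include <errno.h>

typedef int (*ioc_fn_t)(void *karg);

typedef struct ioc_info {
	int		ii_cmd;		/* ioctl number */
	size_t		ii_argsize;	/* expected argument size */
	ioc_fn_t	ii_fn;		/* handler */
} ioc_info_t;

static int
ioc_dispatch(const ioc_info_t *tab, size_t ntab, int cmd, void *arg,
    size_t len)
{
	for (size_t i = 0; i < ntab; i++) {
		if (tab[i].ii_cmd != cmd)
			continue;
		if (len < tab[i].ii_argsize)
			return (EINVAL);
		/*
		 * The real framework also copies the argument in and out
		 * and checks the caller's privilege before this call.
		 */
		return (tab[i].ii_fn(arg));
	}
	return (ENOTSUP);
}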
This one is a driver */ @@ -95,30 +68,32 @@ static struct modldrv vnic_modldrv = { }; static struct modlinkage modlinkage = { - MODREV_1, - &vnic_modldrv, - NULL + MODREV_1, &vnic_modldrv, NULL }; int _init(void) { - int err; + int status; mac_init_ops(&vnic_dev_ops, "vnic"); - if ((err = mod_install(&modlinkage)) != 0) + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) mac_fini_ops(&vnic_dev_ops); - return (err); + + return (status); } int _fini(void) { - int err; + int status; - if ((err = mod_remove(&modlinkage)) == 0) + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) mac_fini_ops(&vnic_dev_ops); - return (err); + + return (status); } int @@ -131,16 +106,12 @@ static void vnic_init(void) { vnic_dev_init(); - vnic_bcast_init(); - vnic_classifier_init(); } static void vnic_fini(void) { vnic_dev_fini(); - vnic_bcast_fini(); - vnic_classifier_fini(); } dev_info_t * @@ -159,7 +130,7 @@ vnic_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, *result = vnic_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: - *result = 0; + *result = NULL; return (DDI_SUCCESS); } return (DDI_FAILURE); @@ -174,14 +145,12 @@ vnic_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* we only allow instance 0 to attach */ return (DDI_FAILURE); } - if (dld_ioc_register(VNIC_IOC, vnic_ioc_list, DLDIOCCNT(vnic_ioc_list)) != 0) return (DDI_FAILURE); vnic_dip = dip; vnic_init(); - return (DDI_SUCCESS); case DDI_RESUME: @@ -208,7 +177,6 @@ vnic_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) vnic_dip = NULL; vnic_fini(); dld_ioc_unregister(VNIC_IOC); - return (DDI_SUCCESS); case DDI_SUSPEND: @@ -220,129 +188,135 @@ vnic_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) } /* - * Process a VNIC_IOC_CREATE request. + * Process a VNICIOC_CREATE request. */ /* ARGSUSED */ static int -vnic_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred) +vnic_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { vnic_ioc_create_t *create_arg = karg; - int mac_len; + int err = 0, mac_len = 0, mac_slot; uchar_t mac_addr[MAXMACADDRLEN]; - datalink_id_t vnic_id, linkid; + uint_t mac_prefix_len; vnic_mac_addr_type_t mac_addr_type; - - /* - * VNIC link id - */ - vnic_id = create_arg->vc_vnic_id; - - /* - * Linkid of the link the VNIC is defined on top of. - */ - linkid = create_arg->vc_link_id; + vnic_ioc_diag_t diag = VNIC_IOC_DIAG_NONE; + boolean_t is_anchor = create_arg->vc_flags & VNIC_IOC_CREATE_ANCHOR; /* MAC address */ mac_addr_type = create_arg->vc_mac_addr_type; - mac_len = create_arg->vc_mac_len; + + if (is_anchor) + goto create; switch (mac_addr_type) { case VNIC_MAC_ADDR_TYPE_FIXED: + mac_len = create_arg->vc_mac_len; + /* + * Sanity check the MAC address length. vnic_dev_create() + * will perform additional checks to ensure that the + * address is a valid unicast address of the appropriate + * length. 
+ */ + if (mac_len == 0 || mac_len > MAXMACADDRLEN) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACADDRLEN_INVALID; + goto bail; + } + bcopy(create_arg->vc_mac_addr, mac_addr, MAXMACADDRLEN); + break; + case VNIC_MAC_ADDR_TYPE_FACTORY: + mac_slot = create_arg->vc_mac_slot; + /* sanity check the specified slot number */ + if (mac_slot < 0 && mac_slot != -1) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID; + goto bail; + } + break; + case VNIC_MAC_ADDR_TYPE_AUTO: + mac_slot = -1; + /* FALLTHROUGH */ + case VNIC_MAC_ADDR_TYPE_RANDOM: + mac_prefix_len = create_arg->vc_mac_prefix_len; + if (mac_prefix_len > MAXMACADDRLEN) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACPREFIXLEN_INVALID; + goto bail; + } + mac_len = create_arg->vc_mac_len; + if (mac_len > MAXMACADDRLEN) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACADDRLEN_INVALID; + goto bail; + } bcopy(create_arg->vc_mac_addr, mac_addr, MAXMACADDRLEN); break; + case VNIC_MAC_ADDR_TYPE_PRIMARY: + /* + * We will get the primary address when we add this + * client + */ + break; default: - return (ENOTSUP); + err = ENOTSUP; + goto bail; } - return (vnic_dev_create(vnic_id, linkid, mac_len, mac_addr)); -} +create: + err = vnic_dev_create(create_arg->vc_vnic_id, create_arg->vc_link_id, + &mac_addr_type, &mac_len, mac_addr, &mac_slot, mac_prefix_len, + create_arg->vc_vid, &create_arg->vc_resource_props, + create_arg->vc_flags, &diag); + if (err != 0) + goto bail; -/* ARGSUSED */ -static int -vnic_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - vnic_ioc_modify_t *modify_arg = karg; - datalink_id_t vnic_id; - uint_t modify_mask; - vnic_mac_addr_type_t mac_addr_type; - uint_t mac_len; - uchar_t mac_addr[MAXMACADDRLEN]; + create_arg->vc_mac_addr_type = mac_addr_type; - vnic_id = modify_arg->vm_vnic_id; - modify_mask = modify_arg->vm_modify_mask; + if (is_anchor) + goto bail; - if (modify_mask & VNIC_IOC_MODIFY_ADDR) { - mac_addr_type = modify_arg->vm_mac_addr_type; - mac_len = modify_arg->vm_mac_len; - bcopy(modify_arg->vm_mac_addr, mac_addr, MAXMACADDRLEN); + switch (mac_addr_type) { + case VNIC_MAC_ADDR_TYPE_FACTORY: + create_arg->vc_mac_slot = mac_slot; + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: + bcopy(mac_addr, create_arg->vc_mac_addr, MAXMACADDRLEN); + create_arg->vc_mac_len = mac_len; + break; } - return (vnic_dev_modify(vnic_id, modify_mask, mac_addr_type, - mac_len, mac_addr)); +bail: + create_arg->vc_diag = diag; + create_arg->vc_status = err; + return (err); } /* ARGSUSED */ static int -vnic_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred) +vnic_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - vnic_ioc_delete_t *delete_arg = karg; + vnic_ioc_modify_t *modify_arg = karg; - return (vnic_dev_delete(delete_arg->vd_vnic_id)); + return (vnic_dev_modify(modify_arg->vm_vnic_id, + modify_arg->vm_modify_mask, modify_arg->vm_mac_addr_type, + modify_arg->vm_mac_len, modify_arg->vm_mac_addr, + modify_arg->vm_mac_slot, &modify_arg->vm_resource_props)); } -typedef struct vnic_ioc_info_state { - uint32_t bytes_left; - uchar_t *where; - int mode; -} vnic_ioc_info_state_t; - +/* ARGSUSED */ static int -vnic_ioc_info_new_vnic(void *arg, datalink_id_t id, - vnic_mac_addr_type_t addr_type, uint_t mac_len, uint8_t *mac_addr, - datalink_id_t linkid) +vnic_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - vnic_ioc_info_state_t *state = arg; - /*LINTED*/ - vnic_ioc_info_vnic_t *vn = (vnic_ioc_info_vnic_t *)state->where; - - if (state->bytes_left < sizeof (*vn)) - return 
(ENOSPC);
-
- vn->vn_vnic_id = id;
- vn->vn_link_id = linkid;
- vn->vn_mac_addr_type = addr_type;
- vn->vn_mac_len = mac_len;
- if (ddi_copyout(mac_addr, &(vn->vn_mac_addr), mac_len,
- state->mode) != 0)
- return (EFAULT);
-
- state->where += sizeof (*vn);
- state->bytes_left -= sizeof (*vn);
+ vnic_ioc_delete_t *delete_arg = karg;

- return (0);
+ return (vnic_dev_delete(delete_arg->vd_vnic_id, 0));
}

/* ARGSUSED */
static int
-vnic_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred)
+vnic_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- vnic_ioc_info_t *info_argp = karg;
- uint32_t nvnics;
- datalink_id_t vnic_id, linkid;
- vnic_ioc_info_state_t state;
-
- /*
- * ID of the vnic to return or vnic device.
- * If zero, the call returns information
- * regarding all vnics currently defined.
- */
- vnic_id = info_argp->vi_vnic_id;
- linkid = info_argp->vi_linkid;
-
- state.bytes_left = info_argp->vi_size;
- state.where = (uchar_t *)(arg + sizeof (vnic_ioc_info_t));
- state.mode = mode;
-
- return (vnic_info(&nvnics, vnic_id, linkid, &state,
- vnic_ioc_info_new_vnic));
+ vnic_ioc_info_t *info_arg = karg;
+
+ return (vnic_info(&info_arg->vi_info));
}
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index 7d98003a17..b76ddf678f 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */

-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
@@ -43,35 +41,50 @@
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/dls.h>
#include <sys/pattr.h>
+#include <sys/time.h>
+#include <sys/vlan.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
-#include <sys/gld.h>
-#include <inet/ip.h>
+#include <sys/mac_flow_impl.h>
#include <inet/ip_impl.h>

+/*
+ * Note that for best performance, the VNIC is a passthrough design.
+ * Each VNIC corresponds to a MAC client of the underlying MAC (lower MAC).
+ * This MAC client is opened by the VNIC driver at VNIC creation,
+ * and closed when the VNIC is deleted.
+ * When a MAC client of the VNIC itself opens a VNIC, the MAC layer
+ * (upper MAC) detects that the MAC being opened is a VNIC. Instead
+ * of allocating a new MAC client, it asks the VNIC driver to return
+ * the lower MAC client handle associated with the VNIC, and that handle
+ * is returned to the upper MAC client directly. This allows upper MAC
+ * clients of the VNIC to have direct access to the lower MAC client
+ * for the control path and data path.
+ *
+ * Due to this passthrough, some of the entry points exported by the
+ * VNIC driver are never directly invoked. These entry points include
+ * vnic_m_start, vnic_m_stop, vnic_m_promisc, vnic_m_multicst, etc.
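The shape of that passthrough can be shown in a standalone sketch (every name below is invented for illustration; the real hook is the MAC_CAPAB_VNIC capability that appears further down in this patch). The upper layer queries the driver for a function that hands back the already-open lower client handle instead of allocating a new one:

    #include <stddef.h>

    typedef void *client_handle_t;

    /* What a driver would expose through the capability query. */
    typedef struct {
            void            *cv_arg;
            client_handle_t (*cv_client_handle)(void *);
    } capab_vnic_t;

    /* Driver side: return the lower client opened at creation time. */
    struct toy_vnic {
            client_handle_t lower_client;
    };

    static client_handle_t
    toy_client_handle(void *arg)
    {
            return (((struct toy_vnic *)arg)->lower_client);
    }

    /* Upper-layer side: reuse the handle rather than opening a new client. */
    static client_handle_t
    upper_open(capab_vnic_t *cap)
    {
            return (cap->cv_client_handle(cap->cv_arg));
    }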
+ */ + static int vnic_m_start(void *); static void vnic_m_stop(void *); static int vnic_m_promisc(void *, boolean_t); static int vnic_m_multicst(void *, boolean_t, const uint8_t *); static int vnic_m_unicst(void *, const uint8_t *); static int vnic_m_stat(void *, uint_t, uint64_t *); -static void vnic_m_resources(void *); +static void vnic_m_ioctl(void *, queue_t *, mblk_t *); static mblk_t *vnic_m_tx(void *, mblk_t *); static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *); -static void vnic_mac_free(vnic_mac_t *); -static uint_t vnic_info_walker(mod_hash_key_t, mod_hash_val_t *, void *); static void vnic_notify_cb(void *, mac_notify_type_t); -static int vnic_modify_mac_addr(vnic_t *, uint_t, uchar_t *); -static mblk_t *vnic_active_tx(void *, mblk_t *); -static int vnic_promisc_set(vnic_t *, boolean_t); static kmem_cache_t *vnic_cache; -static kmem_cache_t *vnic_mac_cache; static krwlock_t vnic_lock; -static kmutex_t vnic_mac_lock; static uint_t vnic_count; /* hash of VNICs (vnic_t's), keyed by VNIC id */ @@ -79,39 +92,7 @@ static mod_hash_t *vnic_hash; #define VNIC_HASHSZ 64 #define VNIC_HASH_KEY(vnic_id) ((mod_hash_key_t)(uintptr_t)vnic_id) -/* - * Hash of underlying open MACs (vnic_mac_t's), keyed by the string - * "<device name><instance number>/<port number>". - */ -static mod_hash_t *vnic_mac_hash; -#define VNIC_MAC_HASHSZ 64 - -#define VNIC_MAC_REFHOLD(va) { \ - ASSERT(MUTEX_HELD(&vnic_mac_lock)); \ - (va)->va_refs++; \ - ASSERT((va)->va_refs != 0); \ -} - -#define VNIC_MAC_REFRELE(va) { \ - ASSERT(MUTEX_HELD(&vnic_mac_lock)); \ - ASSERT((va)->va_refs != 0); \ - if (--((va)->va_refs) == 0) \ - vnic_mac_free(va); \ -} - -static uchar_t vnic_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -/* used by vnic_walker */ -typedef struct vnic_info_state { - datalink_id_t vs_vnic_id; - datalink_id_t vs_linkid; - boolean_t vs_vnic_found; - vnic_info_new_vnic_fn_t vs_new_vnic_fn; - void *vs_fn_arg; - int vs_rc; -} vnic_info_state_t; - -#define VNIC_M_CALLBACK_FLAGS (MC_RESOURCES | MC_GETCAPAB) +#define VNIC_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) static mac_callbacks_t vnic_m_callbacks = { VNIC_M_CALLBACK_FLAGS, @@ -122,54 +103,21 @@ static mac_callbacks_t vnic_m_callbacks = { vnic_m_multicst, vnic_m_unicst, vnic_m_tx, - vnic_m_resources, - NULL, /* m_ioctl */ + vnic_m_ioctl, vnic_m_capab_get }; -/* ARGSUSED */ -static int -vnic_mac_ctor(void *buf, void *arg, int kmflag) -{ - vnic_mac_t *vnic_mac = buf; - - bzero(vnic_mac, sizeof (vnic_mac_t)); - rw_init(&vnic_mac->va_bcast_grp_lock, NULL, RW_DRIVER, NULL); - rw_init(&vnic_mac->va_promisc_lock, NULL, RW_DRIVER, NULL); - - return (0); -} - -/* ARGSUSED */ -static void -vnic_mac_dtor(void *buf, void *arg) -{ - vnic_mac_t *vnic_mac = buf; - - rw_destroy(&vnic_mac->va_promisc_lock); - rw_destroy(&vnic_mac->va_bcast_grp_lock); -} - void vnic_dev_init(void) { vnic_cache = kmem_cache_create("vnic_cache", sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - vnic_mac_cache = kmem_cache_create("vnic_mac_cache", - sizeof (vnic_mac_t), 0, vnic_mac_ctor, vnic_mac_dtor, - NULL, NULL, NULL, 0); - vnic_hash = mod_hash_create_idhash("vnic_hash", VNIC_HASHSZ, mod_hash_null_valdtor); - vnic_mac_hash = mod_hash_create_idhash("vnic_mac_hash", - VNIC_MAC_HASHSZ, mod_hash_null_valdtor); - rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&vnic_mac_lock, NULL, MUTEX_DEFAULT, NULL); - vnic_count = 0; } @@ -178,11 +126,8 @@ vnic_dev_fini(void) { ASSERT(vnic_count == 0); - mutex_destroy(&vnic_mac_lock); rw_destroy(&vnic_lock); - 
mod_hash_destroy_idhash(vnic_mac_hash); mod_hash_destroy_idhash(vnic_hash); - kmem_cache_destroy(vnic_mac_cache); kmem_cache_destroy(vnic_cache); } @@ -192,526 +137,162 @@ vnic_dev_count(void) return (vnic_count); } -static int -vnic_mac_open(datalink_id_t linkid, vnic_mac_t **vmp) -{ - int err; - vnic_mac_t *vnic_mac = NULL; - const mac_info_t *mip; - - *vmp = NULL; - - mutex_enter(&vnic_mac_lock); - - err = mod_hash_find(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid, - (mod_hash_val_t *)&vnic_mac); - if (err == 0) { - /* this MAC is already opened, increment reference count */ - VNIC_MAC_REFHOLD(vnic_mac); - mutex_exit(&vnic_mac_lock); - *vmp = vnic_mac; - return (0); - } - - vnic_mac = kmem_cache_alloc(vnic_mac_cache, KM_SLEEP); - if ((err = mac_open_by_linkid(linkid, &vnic_mac->va_mh)) != 0) { - vnic_mac->va_mh = NULL; - goto bail; - } - - /* - * For now, we do not support VNICs over legacy drivers. This will - * soon be changed. - */ - if (mac_is_legacy(vnic_mac->va_mh)) { - err = ENOTSUP; - goto bail; - } - - /* only ethernet support, for now */ - mip = mac_info(vnic_mac->va_mh); - if (mip->mi_media != DL_ETHER) { - err = ENOTSUP; - goto bail; - } - if (mip->mi_media != mip->mi_nativemedia) { - err = ENOTSUP; - goto bail; - } - - vnic_mac->va_linkid = linkid; - - /* add entry to hash table */ - err = mod_hash_insert(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid, - (mod_hash_val_t)vnic_mac); - ASSERT(err == 0); - - /* initialize the flow table associated with lower MAC */ - vnic_mac->va_addr_len = ETHERADDRL; - (void) vnic_classifier_flow_tab_init(vnic_mac, vnic_mac->va_addr_len, - KM_SLEEP); - - vnic_mac->va_txinfo = mac_vnic_tx_get(vnic_mac->va_mh); - vnic_mac->va_notify_hdl = mac_notify_add(vnic_mac->va_mh, - vnic_notify_cb, vnic_mac); - - VNIC_MAC_REFHOLD(vnic_mac); - *vmp = vnic_mac; - mutex_exit(&vnic_mac_lock); - return (0); - -bail: - if (vnic_mac != NULL) { - if (vnic_mac->va_mh != NULL) - mac_close(vnic_mac->va_mh); - kmem_cache_free(vnic_mac_cache, vnic_mac); +static vnic_ioc_diag_t +vnic_mac2vnic_diag(mac_diag_t diag) +{ + switch (diag) { + case MAC_DIAG_MACADDR_NIC: + return (VNIC_IOC_DIAG_MACADDR_NIC); + case MAC_DIAG_MACADDR_INUSE: + return (VNIC_IOC_DIAG_MACADDR_INUSE); + case MAC_DIAG_MACADDR_INVALID: + return (VNIC_IOC_DIAG_MACADDR_INVALID); + case MAC_DIAG_MACADDRLEN_INVALID: + return (VNIC_IOC_DIAG_MACADDRLEN_INVALID); + case MAC_DIAG_MACFACTORYSLOTINVALID: + return (VNIC_IOC_DIAG_MACFACTORYSLOTINVALID); + case MAC_DIAG_MACFACTORYSLOTUSED: + return (VNIC_IOC_DIAG_MACFACTORYSLOTUSED); + case MAC_DIAG_MACFACTORYSLOTALLUSED: + return (VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED); + case MAC_DIAG_MACFACTORYNOTSUP: + return (VNIC_IOC_DIAG_MACFACTORYNOTSUP); + case MAC_DIAG_MACPREFIX_INVALID: + return (VNIC_IOC_DIAG_MACPREFIX_INVALID); + case MAC_DIAG_MACPREFIXLEN_INVALID: + return (VNIC_IOC_DIAG_MACPREFIXLEN_INVALID); + case MAC_DIAG_MACNO_HWRINGS: + return (VNIC_IOC_DIAG_NO_HWRINGS); + default: + return (VNIC_IOC_DIAG_NONE); } - mutex_exit(&vnic_mac_lock); - return (err); } -/* - * Create a new flow for the active MAC client sharing the NIC - * with the VNICs. This allows the unicast packets for that NIC - * to be classified and passed up to the active MAC client. It - * also allows packets sent from a VNIC to the active link to - * be classified by the VNIC transmit function and delivered via - * the MAC module locally. Returns B_TRUE on success, B_FALSE on - * failure. 
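The new vnic_mac2vnic_diag() above is a one-to-one translation switch. The same mapping is sometimes written as a lookup table, which keeps future enum additions honest; a hypothetical equivalent with abbreviated enums (not the driver's actual types):

    #include <stddef.h>

    typedef enum { MD_NONE, MD_ADDR_INUSE, MD_SLOT_USED } mac_diag_t;
    typedef enum { VD_NONE, VD_ADDR_INUSE, VD_SLOT_USED } vnic_diag_t;

    static const struct {
            mac_diag_t      md;
            vnic_diag_t     vd;
    } diag_map[] = {
            { MD_ADDR_INUSE,        VD_ADDR_INUSE },
            { MD_SLOT_USED,         VD_SLOT_USED },
    };

    static vnic_diag_t
    mac2vnic_diag(mac_diag_t md)
    {
            size_t i;

            for (i = 0; i < sizeof (diag_map) / sizeof (diag_map[0]); i++) {
                    if (diag_map[i].md == md)
                            return (diag_map[i].vd);
            }
            return (VD_NONE);       /* unknown diags degrade gracefully */
    }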
- */ static int -vnic_init_active_rx(vnic_mac_t *vnic_mac) -{ - uchar_t nic_mac_addr[MAXMACADDRLEN]; - - if (vnic_mac->va_active_flow != NULL) - return (B_TRUE); - - mac_unicst_get(vnic_mac->va_mh, nic_mac_addr); - - vnic_mac->va_active_flow = vnic_classifier_flow_create( - vnic_mac->va_addr_len, nic_mac_addr, NULL, B_TRUE, KM_SLEEP); - - vnic_classifier_flow_add(vnic_mac, vnic_mac->va_active_flow, - (vnic_rx_fn_t)mac_active_rx, vnic_mac->va_mh, NULL); - return (B_TRUE); -} - -static void -vnic_fini_active_rx(vnic_mac_t *vnic_mac) -{ - if (vnic_mac->va_active_flow == NULL) - return; - - vnic_classifier_flow_remove(vnic_mac, vnic_mac->va_active_flow); - vnic_classifier_flow_destroy(vnic_mac->va_active_flow); - vnic_mac->va_active_flow = NULL; -} - -static void -vnic_update_active_rx(vnic_mac_t *vnic_mac) -{ - if (vnic_mac->va_active_flow == NULL) - return; - - vnic_fini_active_rx(vnic_mac); - (void) vnic_init_active_rx(vnic_mac); -} - -/* - * Copy an mblk, preserving its hardware checksum flags. - */ -mblk_t * -vnic_copymsg_cksum(mblk_t *mp) -{ - mblk_t *mp1; - uint32_t start, stuff, end, value, flags; - - mp1 = copymsg(mp); - if (mp1 == NULL) - return (NULL); - - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, - flags, KM_NOSLEEP); - - return (mp1); -} - -/* - * Copy an mblk chain, presenting the hardware checksum flags of the - * individual mblks. - */ -mblk_t * -vnic_copymsgchain_cksum(mblk_t *mp) -{ - mblk_t *nmp = NULL; - mblk_t **nmpp = &nmp; - - for (; mp != NULL; mp = mp->b_next) { - if ((*nmpp = vnic_copymsg_cksum(mp)) == NULL) { - freemsgchain(nmp); - return (NULL); - } - - nmpp = &((*nmpp)->b_next); - } - - return (nmp); -} - - -/* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback VNIC traffic. - * The function handles a NULL mblk chain passed as argument. - */ -mblk_t * -vnic_fix_cksum(mblk_t *mp_chain) +vnic_unicast_add(vnic_t *vnic, vnic_mac_addr_type_t vnic_addr_type, + int *addr_slot, uint_t prefix_len, int *addr_len_ptr_arg, + uint8_t *mac_addr_arg, uint16_t flags, vnic_ioc_diag_t *diag, + uint16_t vid) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; - uint32_t flags, start, stuff, end, value; - - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + mac_diag_t mac_diag; + uint16_t mac_flags = 0; + int err; + uint_t addr_len; - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, - &flags); - if (flags == 0) - continue; + if (flags & VNIC_IOC_CREATE_NODUPCHECK) + mac_flags |= MAC_UNICAST_NODUPCHECK; + switch (vnic_addr_type) { + case VNIC_MAC_ADDR_TYPE_FIXED: /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. + * The MAC address value to assign to the VNIC + * is already provided in mac_addr_arg. addr_len_ptr_arg + * already contains the MAC address length. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: /* - * Ethernet, and optionally VLAN header. + * Random MAC address. There are two sub-cases: + * + * 1 - If mac_len == 0, a new MAC address is generated. 
+ * The length of the MAC address to generate depends
+ * on the type of MAC used. The prefix to use for the MAC
+ * address is stored in the most significant bytes
+ * of the mac_addr argument, and its length is specified
+ * by the mac_prefix_len argument. This prefix can
+ * correspond to an IEEE OUI in the case of Ethernet,
+ * for example.
+ *
+ * 2 - If mac_len > 0, the address was already picked
+ * randomly, and is now passed back during VNIC
+ * re-creation. The mac_addr argument contains the MAC
+ * address that was generated. We distinguish this
+ * case from the fixed MAC address case, since we
+ * want userland consumers to know, when they query
+ * the list of VNICs, that a VNIC was assigned a
+ * random MAC address vs being assigned a fixed address
+ * specified by the user.
 */
- /*LINTED*/
- ehp = (struct ether_header *)mp->b_rptr;
- if (ntohs(ehp->ether_type) == VLAN_TPID) {
- struct ether_vlan_header *evhp;
-
- ASSERT(MBLKL(mp) >=
- sizeof (struct ether_vlan_header));
- /*LINTED*/
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- sap = ntohs(evhp->ether_type);
- offset = sizeof (struct ether_vlan_header);
- } else {
- sap = ntohs(ehp->ether_type);
- offset = sizeof (struct ether_header);
- }
- if (MBLKL(mp) <= offset) {
- offset -= MBLKL(mp);
- if (mp->b_cont == NULL) {
- /* corrupted packet, skip it */
- if (prev != NULL)
- prev->b_next = mp->b_next;
- else
- new_chain = mp->b_next;
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- continue;
- }
- mp = mp->b_cont;
- }
-
- if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
- ipha_t *ipha = NULL;
-
- /*
- * In order to compute the full and header
- * checksums, we need to find and parse
- * the IP and/or ULP headers.
- */
-
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
-
- /*
- * IP header.
- */
- if (sap != ETHERTYPE_IP)
- continue;
-
- ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
- /*LINTED*/
- ipha = (ipha_t *)(mp->b_rptr + offset);
-
- if (flags & HCK_FULLCKSUM) {
- ipaddr_t src, dst;
- uint32_t cksum;
- uint16_t *up;
- uint8_t proto;
-
- /*
- * Pointer to checksum field in ULP header.
- */
- proto = ipha->ipha_protocol;
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
- if (proto == IPPROTO_TCP) {
- /*LINTED*/
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- } else {
- ASSERT(proto == IPPROTO_UDP);
- /*LINTED*/
- up = IPH_UDPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- }
-
- /*
- * Pseudo-header checksum.
- */
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
- len = ntohs(ipha->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
-
- /*
- * The checksum value stored in the packet needs
- * to be correct. Compute it here.
- */
- *up = 0;
- cksum += (((proto) == IPPROTO_UDP) ?
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
- cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
- offset, cksum);
- *(up) = (uint16_t)(cksum ?
cksum : ~cksum); - - flags |= HCK_FULLCKSUM_OK; - value = 0xffff; - } - - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - } - } - - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } - - ipp = mp->b_rptr + offset; - /*LINTED*/ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; - - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + /* + * If it's a pre-generated address, we're done. mac_addr_arg + * and addr_len_ptr_arg already contain the MAC address + * value and length. + */ + if (*addr_len_ptr_arg > 0) + break; - /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. - */ - flags &= ~HCK_PARTIALCKSUM; - flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); - value = 0xffff; + /* generate a new random MAC address */ + if ((err = mac_addr_random(vnic->vn_mch, + prefix_len, mac_addr_arg, &mac_diag)) != 0) { + *diag = vnic_mac2vnic_diag(mac_diag); + return (err); } + *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh); + break; - (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, - value, flags, KM_NOSLEEP); - } - - return (new_chain); -} - -static void -vnic_mac_close(vnic_mac_t *vnic_mac) -{ - mutex_enter(&vnic_mac_lock); - VNIC_MAC_REFRELE(vnic_mac); - mutex_exit(&vnic_mac_lock); -} - -static void -vnic_mac_free(vnic_mac_t *vnic_mac) -{ - mod_hash_val_t val; - - ASSERT(MUTEX_HELD(&vnic_mac_lock)); - vnic_fini_active_rx(vnic_mac); - mac_notify_remove(vnic_mac->va_mh, vnic_mac->va_notify_hdl); - if (vnic_mac->va_mac_set) { - vnic_mac->va_mac_set = B_FALSE; - mac_vnic_clear(vnic_mac->va_mh); - } - vnic_classifier_flow_tab_fini(vnic_mac); - mac_close(vnic_mac->va_mh); - - (void) mod_hash_remove(vnic_mac_hash, - (mod_hash_key_t)(uintptr_t)vnic_mac->va_linkid, &val); - ASSERT(vnic_mac == (vnic_mac_t *)val); - - kmem_cache_free(vnic_mac_cache, vnic_mac); -} - -/* - * Initial VNIC receive routine. Invoked for packets that are steered - * to a VNIC but the VNIC has not been started yet. - */ -/* ARGSUSED */ -static void -vnic_rx_initial(void *arg1, void *arg2, mblk_t *mp_chain) -{ - vnic_t *vnic = arg1; - mblk_t *mp; - - /* update stats */ - for (mp = mp_chain; mp != NULL; mp = mp->b_next) - vnic->vn_stat_ierrors++; - freemsgchain(mp_chain); -} - -/* - * VNIC receive routine invoked after the classifier for the VNIC - * has been initialized and the VNIC has been started. - */ -/* ARGSUSED */ -void -vnic_rx(void *arg1, void *arg2, mblk_t *mp_chain) -{ - vnic_t *vnic = arg1; - mblk_t *mp; - - /* update stats */ - for (mp = mp_chain; mp != NULL; mp = mp->b_next) { - vnic->vn_stat_ipackets++; - vnic->vn_stat_rbytes += msgdsize(mp); - } - - /* pass packet up */ - mac_rx(vnic->vn_mh, NULL, mp_chain); -} - -/* - * Routine to create a MAC-based VNIC. Adds the passed MAC address - * to an unused slot in the NIC if one is available. Otherwise it - * sets the NIC in promiscuous mode and assigns the MAC address to - * a Rx ring if available or a soft ring. 
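The vnic_fix_cksum() logic removed above was doing RFC 1071 arithmetic in software for looped-back packets: sum the pseudo-header, sum the payload, then fold to 16 bits. For reference, the core computation in a self-contained form (this sketch is independent of the kernel's IP_CSUM macros):

    #include <stddef.h>
    #include <stdint.h>

    /* Fold a 32-bit accumulator into a 16-bit ones'-complement checksum. */
    static uint16_t
    csum_fold(uint32_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return ((uint16_t)~sum);
    }

    /* Sum a byte buffer in network order, starting from an initial value. */
    static uint16_t
    inet_cksum(const uint8_t *p, size_t len, uint32_t initial)
    {
            uint32_t sum = initial;

            while (len > 1) {
                    sum += ((uint32_t)p[0] << 8) | p[1];
                    p += 2;
                    len -= 2;
            }
            if (len == 1)
                    sum += (uint32_t)p[0] << 8;
            return (csum_fold(sum));
    }

The initial argument carries the pseudo-header sum, which is how the removed code seeded IP_CSUM() above.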
- */ -static int -vnic_add_unicstaddr(vnic_t *vnic, mac_multi_addr_t *maddr) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int err; - - if (mac_unicst_verify(vnic_mac->va_mh, maddr->mma_addr, - maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - if (mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_MULTIADDRESS, - &(vnic->vn_mma_capab))) { - if (vnic->vn_maddr_naddrfree == 0) { - /* - * No free address slots available. - * Enable promiscuous mode. - */ - goto set_promisc; + case VNIC_MAC_ADDR_TYPE_FACTORY: + err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot); + if (err != 0) { + if (err == EINVAL) + *diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID; + if (err == EBUSY) + *diag = VNIC_IOC_DIAG_MACFACTORYSLOTUSED; + if (err == ENOSPC) + *diag = VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED; + return (err); } - err = vnic->vn_maddr_add(vnic->vn_maddr_handle, maddr); - if (err != 0) { - if (err == ENOSPC) { - /* - * There was a race to add addresses - * with other multiple address consumers, - * and we lost out. Use promisc mode. - */ - goto set_promisc; - } + mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot, + mac_addr_arg, &addr_len, NULL, NULL); + *addr_len_ptr_arg = addr_len; + break; - return (err); + case VNIC_MAC_ADDR_TYPE_AUTO: + /* first try to allocate a factory MAC address */ + err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot); + if (err == 0) { + mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot, + mac_addr_arg, &addr_len, NULL, NULL); + vnic_addr_type = VNIC_MAC_ADDR_TYPE_FACTORY; + *addr_len_ptr_arg = addr_len; + break; } - vnic->vn_slot_id = maddr->mma_slot; - vnic->vn_multi_mac = B_TRUE; - } else { /* - * Either multiple MAC address support is not - * available or all available addresses have - * been used up. + * Allocating a factory MAC address failed, generate a + * random MAC address instead. */ - set_promisc: - if ((err = mac_promisc_set(vnic_mac->va_mh, B_TRUE, - MAC_DEVPROMISC)) != 0) { + if ((err = mac_addr_random(vnic->vn_mch, + prefix_len, mac_addr_arg, &mac_diag)) != 0) { + *diag = vnic_mac2vnic_diag(mac_diag); return (err); } - - vnic->vn_promisc_mac = B_TRUE; + *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh); + vnic_addr_type = VNIC_MAC_ADDR_TYPE_RANDOM; + break; + case VNIC_MAC_ADDR_TYPE_PRIMARY: + /* + * We get the address here since we copy it in the + * vnic's vn_addr. + */ + mac_unicast_primary_get(vnic->vn_lower_mh, mac_addr_arg); + *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh); + mac_flags |= MAC_UNICAST_VNIC_PRIMARY; + break; } - return (err); -} -/* - * VNIC is getting deleted. Remove the MAC address from the slot. - * If promiscuous mode was being used, then unset the promiscuous mode. 
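The VNIC_MAC_ADDR_TYPE_AUTO case above is a try-then-fall-back: reserve a factory slot if one is free, otherwise degrade to a random locally-administered address. Isolated as a compilable toy (the pool and helper names are made up; the real calls are mac_addr_factory_reserve() and mac_addr_random()):

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define ADDRLEN 6
    enum { TYPE_FACTORY, TYPE_RANDOM };

    /* Toy factory pool: two slots, slot 0 already taken. */
    static uint8_t factory_pool[2][ADDRLEN] = {
            { 0x00, 0x14, 0x4f, 0x00, 0x00, 0x01 },
            { 0x00, 0x14, 0x4f, 0x00, 0x00, 0x02 },
    };
    static int factory_used[2] = { 1, 0 };

    static int
    reserve_factory_slot(int *slot, uint8_t *addr)
    {
            int i;

            for (i = 0; i < 2; i++) {
                    if (!factory_used[i]) {
                            factory_used[i] = 1;
                            *slot = i;
                            memcpy(addr, factory_pool[i], ADDRLEN);
                            return (0);
                    }
            }
            return (ENOSPC);
    }

    static void
    random_mac(uint8_t *addr)
    {
            int i;

            for (i = 0; i < ADDRLEN; i++)
                    addr[i] = rand() & 0xff;
            addr[0] = (addr[0] | 0x02) & ~0x01; /* locally administered, unicast */
    }

    static int
    auto_assign(int *slot, uint8_t *addr, int *type)
    {
            /* Prefer a hardware-provided factory address if one is free. */
            if (reserve_factory_slot(slot, addr) == 0) {
                    *type = TYPE_FACTORY;
                    return (0);
            }
            /* Fall back to a random address, mirroring the AUTO case above. */
            random_mac(addr);
            *type = TYPE_RANDOM;
            return (0);
    }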
- */ -static int -vnic_remove_unicstaddr(vnic_t *vnic) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int err; - - if (vnic->vn_multi_mac) { - ASSERT(vnic->vn_promisc_mac == B_FALSE); - err = vnic->vn_maddr_remove(vnic->vn_maddr_handle, - vnic->vn_slot_id); - vnic->vn_multi_mac = B_FALSE; - } + vnic->vn_addr_type = vnic_addr_type; - if (vnic->vn_promisc_mac) { - ASSERT(vnic->vn_multi_mac == B_FALSE); - err = mac_promisc_set(vnic_mac->va_mh, B_FALSE, MAC_DEVPROMISC); - vnic->vn_promisc_mac = B_FALSE; + err = mac_unicast_add(vnic->vn_mch, mac_addr_arg, mac_flags, + &vnic->vn_muh, vid, &mac_diag); + if (err != 0) { + if (vnic_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) { + /* release factory MAC address */ + mac_addr_factory_release(vnic->vn_mch, *addr_slot); + } + *diag = vnic_mac2vnic_diag(mac_diag); } return (err); @@ -721,21 +302,23 @@ vnic_remove_unicstaddr(vnic_t *vnic) * Create a new VNIC upon request from administrator. * Returns 0 on success, an errno on failure. */ +/* ARGSUSED */ int -vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, - uchar_t *mac_addr) +vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, + vnic_mac_addr_type_t *vnic_addr_type, int *mac_len, uchar_t *mac_addr, + int *mac_slot, uint_t mac_prefix_len, uint16_t vid, + mac_resource_props_t *mrp, uint32_t flags, vnic_ioc_diag_t *diag) { - vnic_t *vnic = NULL; + vnic_t *vnic; mac_register_t *mac; int err; - vnic_mac_t *vnic_mac; - mac_multi_addr_t maddr; - mac_txinfo_t tx_info; + boolean_t is_anchor = ((flags & VNIC_IOC_CREATE_ANCHOR) != 0); + char vnic_name[MAXNAMELEN]; + const mac_info_t *minfop; + uint32_t req_hwgrp_flag = ((flags & VNIC_IOC_CREATE_REQ_HWRINGS) != 0) ? + MAC_OPEN_FLAGS_REQ_HWRINGS : 0; - if (mac_len != ETHERADDRL) { - /* currently only ethernet NICs are supported */ - return (EINVAL); - } + *diag = VNIC_IOC_DIAG_NONE; rw_enter(&vnic_lock, RW_WRITER); @@ -753,36 +336,86 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, return (ENOMEM); } - /* open underlying MAC */ - err = vnic_mac_open(linkid, &vnic_mac); - if (err != 0) { - kmem_cache_free(vnic_cache, vnic); - rw_exit(&vnic_lock); - return (err); - } - bzero(vnic, sizeof (*vnic)); - vnic->vn_id = vnic_id; - vnic->vn_vnic_mac = vnic_mac; + vnic->vn_id = vnic_id; + vnic->vn_link_id = linkid; vnic->vn_started = B_FALSE; - vnic->vn_promisc = B_FALSE; - vnic->vn_multi_mac = B_FALSE; - vnic->vn_bcast_grp = B_FALSE; - - /* set the VNIC MAC address */ - maddr.mma_addrlen = mac_len; - maddr.mma_slot = 0; - maddr.mma_flags = 0; - bcopy(mac_addr, maddr.mma_addr, mac_len); - if ((err = vnic_add_unicstaddr(vnic, &maddr)) != 0) - goto bail; - bcopy(mac_addr, vnic->vn_addr, mac_len); - /* set the initial VNIC capabilities */ - if (!mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_HCKSUM, - &vnic->vn_hcksum_txflags)) - vnic->vn_hcksum_txflags = 0; + if (!is_anchor) { + if (linkid == DATALINK_INVALID_LINKID) { + err = EINVAL; + goto bail; + } + + /* + * Open the lower MAC and assign its initial bandwidth and + * MAC address. We do this here during VNIC creation and + * do not wait until the upper MAC client open so that we + * can validate the VNIC creation parameters (bandwidth, + * MAC address, etc) and reserve a factory MAC address if + * one was requested. + */ + err = mac_open_by_linkid(linkid, &vnic->vn_lower_mh); + if (err != 0) + goto bail; + + /* + * VNIC(vlan) over VNICs(vlans) is not supported. 
+ */ + if (mac_is_vnic(vnic->vn_lower_mh)) { + err = EINVAL; + goto bail; + } + + /* only ethernet support for now */ + minfop = mac_info(vnic->vn_lower_mh); + if (minfop->mi_nativemedia != DL_ETHER) { + err = ENOTSUP; + goto bail; + } + + (void) dls_mgmt_get_linkinfo(vnic_id, vnic_name, NULL, NULL, + NULL); + err = mac_client_open(vnic->vn_lower_mh, &vnic->vn_mch, + vnic_name, MAC_OPEN_FLAGS_IS_VNIC | req_hwgrp_flag); + if (err != 0) + goto bail; + + if (mrp != NULL) { + err = mac_client_set_resources(vnic->vn_mch, mrp); + if (err != 0) + goto bail; + } + /* assign a MAC address to the VNIC */ + + err = vnic_unicast_add(vnic, *vnic_addr_type, mac_slot, + mac_prefix_len, mac_len, mac_addr, flags, diag, vid); + if (err != 0) { + vnic->vn_muh = NULL; + if (diag != NULL && req_hwgrp_flag != 0) + *diag = VNIC_IOC_DIAG_NO_HWRINGS; + goto bail; + } + + /* register to receive notification from underlying MAC */ + vnic->vn_mnh = mac_notify_add(vnic->vn_lower_mh, vnic_notify_cb, + vnic); + + *vnic_addr_type = vnic->vn_addr_type; + vnic->vn_addr_len = *mac_len; + vnic->vn_vid = vid; + + bcopy(mac_addr, vnic->vn_addr, vnic->vn_addr_len); + + if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) + vnic->vn_slot_id = *mac_slot; + + /* set the initial VNIC capabilities */ + if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_HCKSUM, + &vnic->vn_hcksum_txflags)) + vnic->vn_hcksum_txflags = 0; + } /* register with the MAC module */ if ((mac = mac_alloc(MAC_VERSION)) == NULL) @@ -795,27 +428,61 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, mac->m_src_addr = vnic->vn_addr; mac->m_callbacks = &vnic_m_callbacks; - mac_sdu_get(vnic_mac->va_mh, &mac->m_min_sdu, &mac->m_max_sdu); + if (!is_anchor) { + /* + * If this is a VNIC based VLAN, then we check for the + * margin unless it has been created with the force + * flag. If we are configuring a VLAN over an etherstub, + * we don't check the margin even if force is not set. + */ + if (vid == 0 || (flags & VNIC_IOC_CREATE_FORCE) != 0) { + if (vid != VLAN_ID_NONE) + vnic->vn_force = B_TRUE; + /* + * As the current margin size of the underlying mac is + * used to determine the margin size of the VNIC + * itself, request the underlying mac not to change + * to a smaller margin size. + */ + err = mac_margin_add(vnic->vn_lower_mh, + &vnic->vn_margin, B_TRUE); + ASSERT(err == 0); + } else { + vnic->vn_margin = VLAN_TAGSZ; + err = mac_margin_add(vnic->vn_lower_mh, + &vnic->vn_margin, B_FALSE); + if (err != 0) { + mac_free(mac); + if (diag != NULL) + *diag = VNIC_IOC_DIAG_MACMARGIN_INVALID; + goto bail; + } + } + + mac_sdu_get(vnic->vn_lower_mh, &mac->m_min_sdu, + &mac->m_max_sdu); + } else { + vnic->vn_margin = VLAN_TAGSZ; + mac->m_min_sdu = 0; + mac->m_max_sdu = 9000; + } - /* - * As the current margin size of the underlying mac is used to - * determine the margin size of the VNIC itself, request the - * underlying mac not to change to a smaller margin size. 
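The creation path here funnels every failure through a single bail label (its cleanup appears a little further down), and the unwind releases only the handles that were actually acquired, relying on the earlier bzero() of the vnic_t to leave untouched handles NULL. The idiom, reduced to a compilable toy with stand-in open/close helpers:

    #include <errno.h>
    #include <stdlib.h>

    typedef struct res {
            void    *lower;         /* NULL until successfully opened */
            void    *client;
    } res_t;

    static void *open_lower(void)    { return (malloc(1)); }
    static void *open_client(void)   { return (malloc(1)); }
    static void close_lower(void *h)  { free(h); }
    static void close_client(void *h) { free(h); }

    static int
    res_create(res_t **out)
    {
            res_t *r;
            int err = 0;

            if ((r = calloc(1, sizeof (*r))) == NULL)
                    return (ENOMEM);
            if ((r->lower = open_lower()) == NULL) {
                    err = ENXIO;
                    goto bail;
            }
            if ((r->client = open_client()) == NULL) {
                    err = ENXIO;
                    goto bail;
            }
            *out = r;
            return (0);
    bail:
            /* Unwind in reverse order; NULL means "never acquired". */
            if (r->client != NULL)
                    close_client(r->client);
            if (r->lower != NULL)
                    close_lower(r->lower);
            free(r);
            return (err);
    }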
- */ - err = mac_margin_add(vnic_mac->va_mh, &(vnic->vn_margin), B_TRUE); - if (err != 0) - goto bail; mac->m_margin = vnic->vn_margin; + err = mac_register(mac, &vnic->vn_mh); mac_free(mac); if (err != 0) { - VERIFY(mac_margin_remove(vnic_mac->va_mh, + VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin) == 0); goto bail; } + /* Set the VNIC's MAC in the client */ + if (!is_anchor) + mac_set_upper_mac(vnic->vn_mch, vnic->vn_mh); + if ((err = dls_devnet_create(vnic->vn_mh, vnic->vn_id)) != 0) { - VERIFY(mac_margin_remove(vnic_mac->va_mh, + VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin) == 0); (void) mac_unregister(vnic->vn_mh); goto bail; @@ -829,69 +496,22 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, rw_exit(&vnic_lock); - /* Create a flow, initialized with the MAC address of the VNIC */ - if ((vnic->vn_flow_ent = vnic_classifier_flow_create(mac_len, mac_addr, - NULL, B_FALSE, KM_SLEEP)) == NULL) { - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - err = ENOMEM; - goto bail_unlocked; - } - - vnic_classifier_flow_add(vnic_mac, vnic->vn_flow_ent, vnic_rx_initial, - vnic, vnic); - - /* setup VNIC to receive broadcast packets */ - err = vnic_bcast_add(vnic, vnic_brdcst_mac, MAC_ADDRTYPE_BROADCAST); - if (err != 0) { - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - goto bail_unlocked; - } - vnic->vn_bcast_grp = B_TRUE; - - mutex_enter(&vnic_mac_lock); - if (!vnic_mac->va_mac_set) { - /* - * We want to MAC layer to call the VNIC tx outbound - * routine, so that local broadcast packets sent by - * the active interface sharing the underlying NIC (if - * any), can be broadcast to every VNIC. - */ - tx_info.mt_fn = vnic_active_tx; - tx_info.mt_arg = vnic_mac; - if (!mac_vnic_set(vnic_mac->va_mh, &tx_info, - vnic_m_capab_get, vnic)) { - mutex_exit(&vnic_mac_lock); - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - err = EBUSY; - goto bail_unlocked; - } - vnic_mac->va_mac_set = B_TRUE; - } - mutex_exit(&vnic_mac_lock); - - /* allow passing packets to NIC's active MAC client */ - if (!vnic_init_active_rx(vnic_mac)) { - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - err = ENOMEM; - goto bail_unlocked; - } - return (0); bail: - (void) vnic_remove_unicstaddr(vnic); - vnic_mac_close(vnic_mac); rw_exit(&vnic_lock); - -bail_unlocked: - if (vnic != NULL) { - kmem_cache_free(vnic_cache, vnic); + if (!is_anchor) { + if (vnic->vn_mnh != NULL) + (void) mac_notify_remove(vnic->vn_mnh, B_TRUE); + if (vnic->vn_muh != NULL) + (void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh); + if (vnic->vn_mch != NULL) + mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC); + if (vnic->vn_lower_mh != NULL) + mac_close(vnic->vn_lower_mh); } + kmem_cache_free(vnic_cache, vnic); return (err); } @@ -901,11 +521,10 @@ bail_unlocked: /* ARGSUSED */ int vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask, - vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr) + vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr, + uint_t mac_slot, mac_resource_props_t *mrp) { vnic_t *vnic = NULL; - int rv = 0; - boolean_t notify_mac_addr = B_FALSE; rw_enter(&vnic_lock, RW_WRITER); @@ -915,29 +534,19 @@ vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask, return (ENOENT); } - if (modify_mask & VNIC_IOC_MODIFY_ADDR) { - rv = vnic_modify_mac_addr(vnic, mac_len, mac_addr); - if (rv == 0) - notify_mac_addr = B_TRUE; - } - rw_exit(&vnic_lock); - if (notify_mac_addr) - mac_unicst_update(vnic->vn_mh, mac_addr); - - return 
(rv); + return (0); } +/* ARGSUSED */ int -vnic_dev_delete(datalink_id_t vnic_id) +vnic_dev_delete(datalink_id_t vnic_id, uint32_t flags) { vnic_t *vnic = NULL; mod_hash_val_t val; - vnic_flow_t *flent; datalink_id_t tmpid; int rc; - vnic_mac_t *vnic_mac; rw_enter(&vnic_lock, RW_WRITER); @@ -947,7 +556,7 @@ vnic_dev_delete(datalink_id_t vnic_id) return (ENOENT); } - if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid)) != 0) { + if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid, B_TRUE)) != 0) { rw_exit(&vnic_lock); return (rc); } @@ -957,317 +566,136 @@ vnic_dev_delete(datalink_id_t vnic_id) /* * We cannot unregister the MAC yet. Unregistering would * free up mac_impl_t which should not happen at this time. - * Packets could be entering vnic_rx() through the - * flow entry and so mac_impl_t cannot be NULL. So disable - * mac_impl_t by calling mac_disable(). This will prevent any - * new claims on mac_impl_t. + * So disable mac_impl_t by calling mac_disable(). This will prevent + * any new claims on mac_impl_t. */ - if (mac_disable(vnic->vn_mh) != 0) { + if ((rc = mac_disable(vnic->vn_mh)) != 0) { (void) dls_devnet_create(vnic->vn_mh, vnic_id); rw_exit(&vnic_lock); - return (EBUSY); + return (rc); } (void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val); ASSERT(vnic == (vnic_t *)val); - - if (vnic->vn_bcast_grp) - (void) vnic_bcast_delete(vnic, vnic_brdcst_mac); - - flent = vnic->vn_flow_ent; - if (flent != NULL) { - /* - * vnic_classifier_flow_destroy() ensures that the - * flow is no longer used. - */ - vnic_classifier_flow_remove(vnic->vn_vnic_mac, flent); - vnic_classifier_flow_destroy(flent); - } - - rc = mac_margin_remove(vnic->vn_vnic_mac->va_mh, vnic->vn_margin); - ASSERT(rc == 0); - rc = mac_unregister(vnic->vn_mh); - ASSERT(rc == 0); - (void) vnic_remove_unicstaddr(vnic); - vnic_mac = vnic->vn_vnic_mac; - kmem_cache_free(vnic_cache, vnic); vnic_count--; rw_exit(&vnic_lock); - vnic_mac_close(vnic_mac); - return (0); -} - -/* - * For the specified packet chain, return a sub-chain to be sent - * and the transmit function to be used to send the packet. Also - * return a pointer to the sub-chain of packets that should - * be re-classified. If the function returns NULL, the packet - * should be sent using the underlying NIC. - */ -static vnic_flow_t * -vnic_classify(vnic_mac_t *vnic_mac, mblk_t *mp, mblk_t **mp_chain_rest) -{ - vnic_flow_t *flow_ent; - - /* one packet at a time */ - *mp_chain_rest = mp->b_next; - mp->b_next = NULL; - - /* do classification on the packet */ - flow_ent = vnic_classifier_get_flow(vnic_mac, mp); - return (flow_ent); -} - -/* - * Send a packet chain to a local VNIC or an active MAC client. - */ -static void -vnic_local_tx(vnic_mac_t *vnic_mac, vnic_flow_t *flow_ent, mblk_t *mp_chain) -{ - mblk_t *mp1; - const vnic_flow_fn_info_t *fn_info; - vnic_t *vnic; - - if (!vnic_classifier_is_active(flow_ent) && - mac_promisc_get(vnic_mac->va_mh, MAC_PROMISC)) { - /* - * If the MAC is in promiscous mode, - * send a copy of the active client. - */ - if ((mp1 = vnic_copymsgchain_cksum(mp_chain)) == NULL) - goto sendit; - if ((mp1 = vnic_fix_cksum(mp1)) == NULL) - goto sendit; - mac_active_rx(vnic_mac->va_mh, NULL, mp1); - } -sendit: - fn_info = vnic_classifier_get_fn_info(flow_ent); /* - * If the vnic to which we would deliver this packet is in - * promiscuous mode then it already received the packet via - * vnic_promisc_rx(). - * - * XXX assumes that ff_arg2 is a vnic_t pointer if it is - * non-NULL (currently always true). 
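The delete path above is ordered deliberately: mac_disable() fails while the MAC still has consumers, the hash removal then makes the VNIC unreachable for new lookups, and only afterward are resources released. The same three-step shape in miniature (toy types, not the driver's):

    #include <errno.h>
    #include <stdlib.h>

    typedef struct obj {
            int     busy;   /* nonzero while consumers hold the object */
    } obj_t;

    static obj_t *table[64];        /* stand-in for the id hash */

    static int
    obj_delete(int id)
    {
            obj_t *o;

            if (id < 0 || id >= 64 || (o = table[id]) == NULL)
                    return (ENOENT);
            if (o->busy)
                    return (EBUSY); /* step 1: refuse while referenced */
            table[id] = NULL;       /* step 2: unhash, no new lookups */
            free(o);                /* step 3: tear down, now unreachable */
            return (0);
    }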
+ * XXX-nicolas shouldn't have a void cast here, if it's + * expected that the function will never fail, then we should + * have an ASSERT(). */ - vnic = (vnic_t *)fn_info->ff_arg2; - if ((vnic != NULL) && vnic->vn_promisc) - freemsg(mp_chain); - else if ((mp1 = vnic_fix_cksum(mp_chain)) != NULL) - (fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp1); -} + (void) mac_unregister(vnic->vn_mh); -/* - * This function is invoked when a MAC client needs to send a packet - * to a NIC which is shared by VNICs. It is passed to the MAC layer - * by a call to mac_vnic_set() when the NIC is opened, and is returned - * to MAC clients by mac_tx_get() when VNICs are present. - */ -mblk_t * -vnic_active_tx(void *arg, mblk_t *mp_chain) -{ - vnic_mac_t *vnic_mac = arg; - mblk_t *mp, *extra_mp = NULL; - vnic_flow_t *flow_ent; - void *flow_cookie; - const mac_txinfo_t *mtp = vnic_mac->va_txinfo; - - for (mp = mp_chain; mp != NULL; mp = extra_mp) { - mblk_t *next; - - next = mp->b_next; - mp->b_next = NULL; - - vnic_promisc_rx(vnic_mac, (vnic_t *)-1, mp); - - flow_ent = vnic_classify(vnic_mac, mp, &extra_mp); - ASSERT(extra_mp == NULL); - extra_mp = next; - - if (flow_ent != NULL) { - flow_cookie = vnic_classifier_get_client_cookie( - flow_ent); - if (flow_cookie != NULL) { - /* - * Send a copy to every VNIC defined on the - * interface, as well as the underlying MAC. - */ - vnic_bcast_send(flow_cookie, (vnic_t *)-1, mp); - } else { - /* - * loopback the packet to a local VNIC or - * an active MAC client. - */ - vnic_local_tx(vnic_mac, flow_ent, mp); - } - VNIC_FLOW_REFRELE(flow_ent); - mp_chain = NULL; - } else { - /* - * Non-VNIC destination, send via the underlying - * NIC. In order to avoid a recursive call - * to this function, we ensured that mtp points - * to the unerlying NIC transmit function - * by inilizating through mac_vnic_tx_get(). - */ - mp_chain = mtp->mt_fn(mtp->mt_arg, mp); - if (mp_chain != NULL) - break; + if (vnic->vn_lower_mh != NULL) { + /* + * Check if MAC address for the vnic was obtained from the + * factory MAC addresses. If yes, release it. + */ + if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) { + (void) mac_addr_factory_release(vnic->vn_mch, + vnic->vn_slot_id); } + (void) mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin); + (void) mac_notify_remove(vnic->vn_mnh, B_TRUE); + (void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh); + mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC); + mac_close(vnic->vn_lower_mh); } - if ((mp_chain != NULL) && (extra_mp != NULL)) { - ASSERT(mp_chain->b_next == NULL); - mp_chain->b_next = extra_mp; - } - return (mp_chain); + kmem_cache_free(vnic_cache, vnic); + return (0); } -/* - * VNIC transmit function. - */ +/* ARGSUSED */ mblk_t * vnic_m_tx(void *arg, mblk_t *mp_chain) { - vnic_t *vnic = arg; - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - mblk_t *mp, *extra_mp = NULL; - vnic_flow_t *flow_ent; - void *flow_cookie; - /* - * Update stats. + * This function could be invoked for an anchor VNIC when sending + * broadcast and multicast packets, and unicast packets which did + * not match any local known destination. 
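Since an anchor VNIC has no hardware underneath, the replacement transmit body that follows simply drops the chain. For readers unfamiliar with STREAMS message chains, freemsgchain() amounts to walking a b_next-linked list and freeing each message; a userland analogue:

    #include <stdlib.h>

    typedef struct msg {
            struct msg      *b_next;        /* chain linkage, as in mblk_t */
            void            *data;
    } msg_t;

    static void
    free_msg_chain(msg_t *chain)
    {
            while (chain != NULL) {
                    msg_t *next = chain->b_next;

                    free(chain->data);
                    free(chain);
                    chain = next;
            }
    }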
*/
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_opackets++;
- vnic->vn_stat_obytes += msgdsize(mp);
- }
-
- for (mp = mp_chain; mp != NULL; mp = extra_mp) {
- mblk_t *next;
-
- next = mp->b_next;
- mp->b_next = NULL;
-
- vnic_promisc_rx(vnic->vn_vnic_mac, vnic, mp);
-
- flow_ent = vnic_classify(vnic->vn_vnic_mac, mp, &extra_mp);
- ASSERT(extra_mp == NULL);
- extra_mp = next;
-
- if (flow_ent != NULL) {
- flow_cookie = vnic_classifier_get_client_cookie(
- flow_ent);
- if (flow_cookie != NULL) {
- /*
- * The vnic_bcast_send function expects
- * to receive the sender VNIC as value
- * for arg2.
- */
- vnic_bcast_send(flow_cookie, vnic, mp);
- } else {
- /*
- * loopback the packet to a local VNIC or
- * an active MAC client.
- */
- vnic_local_tx(vnic_mac, flow_ent, mp);
- }
- VNIC_FLOW_REFRELE(flow_ent);
- mp_chain = NULL;
- } else {
- /*
- * Non-local destination, send via the underlying
- * NIC.
- */
- const mac_txinfo_t *mtp = vnic->vn_txinfo;
- mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
- if (mp_chain != NULL)
- break;
- }
- }
-
- /* update stats to account for unsent packets */
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_opackets--;
- vnic->vn_stat_obytes -= msgdsize(mp);
- vnic->vn_stat_oerrors++;
- /*
- * link back in the last portion not counted due to bandwidth
- * control.
- */
- if (mp->b_next == NULL) {
- mp->b_next = extra_mp;
- break;
- }
- }
-
- return (mp_chain);
+ freemsgchain(mp_chain);
+ return (NULL);
}

-/* ARGSUSED */
+/*ARGSUSED*/
static void
-vnic_m_resources(void *arg)
+vnic_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
- /* no resources to advertise */
+ miocnak(q, mp, 0, ENOTSUP);
}

+/*
+ * This entry point cannot be passed through, since it is invoked
+ * for the per-VNIC kstats which must be exported independently
+ * of the existence of VNIC MAC clients.
+ */
static int
vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
{
vnic_t *vnic = arg;
int rval = 0;

- rw_enter(&vnic_lock, RW_READER);
+ if (vnic->vn_lower_mh == NULL) {
+ /*
+ * It's an anchor VNIC, which does not have any
+ * statistics in itself.
+ */
+ return (ENOTSUP);
+ }
+
+ /*
+ * ENOTSUP must be reported for unsupported stats; the VNIC
+ * driver reports only a subset of the stats that would
+ * be returned by a real piece of hardware.
+ */
switch (stat) {
- case ETHER_STAT_LINK_DUPLEX:
- *val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
- ETHER_STAT_LINK_DUPLEX);
- break;
+ case MAC_STAT_LINK_STATE:
+ case MAC_STAT_LINK_UP:
+ case MAC_STAT_PROMISC:
case MAC_STAT_IFSPEED:
- *val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
- MAC_STAT_IFSPEED);
- break;
case MAC_STAT_MULTIRCV:
- *val = vnic->vn_stat_multircv;
- break;
- case MAC_STAT_BRDCSTRCV:
- *val = vnic->vn_stat_brdcstrcv;
- break;
case MAC_STAT_MULTIXMT:
- *val = vnic->vn_stat_multixmt;
- break;
+ case MAC_STAT_BRDCSTRCV:
case MAC_STAT_BRDCSTXMT:
- *val = vnic->vn_stat_brdcstxmt;
- break;
+ case MAC_STAT_OPACKETS:
+ case MAC_STAT_OBYTES:
case MAC_STAT_IERRORS:
- *val = vnic->vn_stat_ierrors;
- break;
case MAC_STAT_OERRORS:
- *val = vnic->vn_stat_oerrors;
- break;
case MAC_STAT_RBYTES:
- *val = vnic->vn_stat_rbytes;
- break;
case MAC_STAT_IPACKETS:
- *val = vnic->vn_stat_ipackets;
- break;
- case MAC_STAT_OBYTES:
- *val = vnic->vn_stat_obytes;
- break;
- case MAC_STAT_OPACKETS:
- *val = vnic->vn_stat_opackets;
+ *val = mac_client_stat_get(vnic->vn_mch, stat);
break;
default:
rval = ENOTSUP;
}
- rw_exit(&vnic_lock);
return (rval);
}

/*
+ * Invoked by the upper MAC to retrieve the lower MAC client handle
+ * corresponding to a VNIC. A pointer to this function is obtained
+ * by the upper MAC via capability query.
+ *
+ * XXX-nicolas Note: this currently causes all VNIC MAC clients to
+ * receive the same MAC client handle for the same VNIC. This is ok
+ * as long as we have only one VNIC MAC client which sends and
+ * receives data, but we don't currently enforce this at the MAC layer.
+ */
+static void *
+vnic_mac_client_handle(void *vnic_arg)
+{
+ vnic_t *vnic = vnic_arg;
+
+ return (vnic->vn_mch);
+}
+
+
+/*
 * Return information about the specified capability.
 */
/* ARGSUSED */
@@ -1277,8 +705,6 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
vnic_t *vnic = arg;

switch (cap) {
- case MAC_CAPAB_POLL:
- return (B_TRUE);
case MAC_CAPAB_HCKSUM: {
uint32_t *hcksum_txflags = cap_data;
@@ -1287,331 +713,129 @@
HCKSUM_INET_PARTIAL);
break;
}
+ case MAC_CAPAB_VNIC: {
+ mac_capab_vnic_t *vnic_capab = cap_data;
+
+ if (vnic->vn_lower_mh == NULL) {
+ /*
+ * It's an anchor VNIC; we don't have an underlying
+ * NIC or a MAC client handle.
+ */ + return (B_FALSE); + } + + if (vnic_capab != NULL) { + vnic_capab->mcv_arg = vnic; + vnic_capab->mcv_mac_client_handle = + vnic_mac_client_handle; + } + break; + } + case MAC_CAPAB_ANCHOR_VNIC: { + /* since it's an anchor VNIC we don't have lower mac handle */ + if (vnic->vn_lower_mh == NULL) { + ASSERT(vnic->vn_link_id == 0); + return (B_TRUE); + } + return (B_FALSE); + } + case MAC_CAPAB_NO_NATIVEVLAN: + case MAC_CAPAB_NO_ZCOPY: + return (B_TRUE); default: return (B_FALSE); } return (B_TRUE); } +/* ARGSUSED */ static int vnic_m_start(void *arg) { - vnic_t *vnic = arg; - mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh; - int rc; - - rc = mac_start(lower_mh); - if (rc != 0) - return (rc); - - vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx, vnic, vnic); return (0); } +/* ARGSUSED */ static void vnic_m_stop(void *arg) { - vnic_t *vnic = arg; - mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh; - - vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx_initial, - vnic, vnic); - mac_stop(lower_mh); } /* ARGSUSED */ static int vnic_m_promisc(void *arg, boolean_t on) { - vnic_t *vnic = arg; - - return (vnic_promisc_set(vnic, on)); + return (0); } +/* ARGSUSED */ static int vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) { - vnic_t *vnic = arg; - int rc = 0; - - if (add) - rc = vnic_bcast_add(vnic, addrp, MAC_ADDRTYPE_MULTICAST); - else - vnic_bcast_delete(vnic, addrp); - - return (rc); + return (0); } static int -vnic_m_unicst(void *arg, const uint8_t *mac_addr) +vnic_m_unicst(void *arg, const uint8_t *macaddr) { vnic_t *vnic = arg; - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int rv; - rw_enter(&vnic_lock, RW_WRITER); - rv = vnic_modify_mac_addr(vnic, vnic_mac->va_addr_len, - (uchar_t *)mac_addr); - rw_exit(&vnic_lock); - - if (rv == 0) - mac_unicst_update(vnic->vn_mh, mac_addr); - return (0); + return (mac_vnic_unicast_set(vnic->vn_mch, macaddr)); } int -vnic_info(uint_t *nvnics, datalink_id_t vnic_id, datalink_id_t linkid, - void *fn_arg, vnic_info_new_vnic_fn_t new_vnic_fn) -{ - vnic_info_state_t state; - int rc = 0; - - rw_enter(&vnic_lock, RW_READER); - - *nvnics = vnic_count; - - bzero(&state, sizeof (state)); - state.vs_vnic_id = vnic_id; - state.vs_linkid = linkid; - state.vs_new_vnic_fn = new_vnic_fn; - state.vs_fn_arg = fn_arg; - - mod_hash_walk(vnic_hash, vnic_info_walker, &state); - - if ((rc = state.vs_rc) == 0 && vnic_id != DATALINK_ALL_LINKID && - !state.vs_vnic_found) - rc = ENOENT; - - rw_exit(&vnic_lock); - return (rc); -} - -/* - * Walker invoked when building a list of vnics that must be passed - * up to user space. - */ -/*ARGSUSED*/ -static uint_t -vnic_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) -{ - vnic_t *vnic; - vnic_info_state_t *state = arg; - - if (state->vs_rc != 0) - return (MH_WALK_TERMINATE); /* terminate walk */ - - vnic = (vnic_t *)val; - - if (state->vs_vnic_id != DATALINK_ALL_LINKID && - vnic->vn_id != state->vs_vnic_id) { - goto bail; - } - - state->vs_vnic_found = B_TRUE; - - state->vs_rc = state->vs_new_vnic_fn(state->vs_fn_arg, - vnic->vn_id, vnic->vn_addr_type, vnic->vn_vnic_mac->va_addr_len, - vnic->vn_addr, vnic->vn_vnic_mac->va_linkid); -bail: - return ((state->vs_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE); -} - -/* - * vnic_notify_cb() and vnic_notify_walker() below are used to - * process events received from an underlying NIC and, if needed, - * forward these events to the VNICs defined on top of that NIC. 
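The rebuilt vnic_info() just below replaces the old walker: lookups are now by id under vnic_lock, and the fields are copied out while the lock pins the entry. The same reader discipline in a pthread sketch (toy types; the kernel uses krwlock_t and mod_hash):

    #include <errno.h>
    #include <pthread.h>
    #include <string.h>

    typedef struct entry {
            int     id;
            char    info[64];
    } entry_t;

    static pthread_rwlock_t tbl_lock = PTHREAD_RWLOCK_INITIALIZER;
    static entry_t *table[64];

    /* outlen must be at least 1. */
    static int
    info_get(int id, char *out, size_t outlen)
    {
            entry_t *e;

            pthread_rwlock_rdlock(&tbl_lock);
            if (id < 0 || id >= 64 || (e = table[id]) == NULL) {
                    pthread_rwlock_unlock(&tbl_lock);
                    return (ENOENT);
            }
            /* Copy while the lock guarantees e cannot be deleted. */
            (void) strncpy(out, e->info, outlen - 1);
            out[outlen - 1] = '\0';
            pthread_rwlock_unlock(&tbl_lock);
            return (0);
    }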
- */ - -typedef struct vnic_notify_state { - mac_notify_type_t vo_type; - vnic_mac_t *vo_vnic_mac; -} vnic_notify_state_t; - -/* ARGSUSED */ -static uint_t -vnic_notify_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +vnic_info(vnic_info_t *info) { - vnic_t *vnic = (vnic_t *)val; - vnic_notify_state_t *state = arg; + vnic_t *vnic; + int err; - /* ignore VNICs that don't use the specified underlying MAC */ - if (vnic->vn_vnic_mac != state->vo_vnic_mac) - return (MH_WALK_CONTINUE); + rw_enter(&vnic_lock, RW_WRITER); - switch (state->vo_type) { - case MAC_NOTE_TX: - mac_tx_update(vnic->vn_mh); - break; - case MAC_NOTE_LINK: - /* - * The VNIC link state must be up regardless of - * the link state of the underlying NIC to maintain - * connectivity between VNICs on the same host. - */ - mac_link_update(vnic->vn_mh, LINK_STATE_UP); - break; - case MAC_NOTE_UNICST: - vnic_update_active_rx(vnic->vn_vnic_mac); - break; - case MAC_NOTE_VNIC: - /* only for clients which share a NIC with a VNIC */ - break; - case MAC_NOTE_PROMISC: - mutex_enter(&vnic_mac_lock); - vnic->vn_vnic_mac->va_txinfo = mac_vnic_tx_get( - vnic->vn_vnic_mac->va_mh); - mutex_exit(&vnic_mac_lock); - break; + err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(info->vn_vnic_id), + (mod_hash_val_t *)&vnic); + if (err != 0) { + rw_exit(&vnic_lock); + return (ENOENT); } - return (MH_WALK_CONTINUE); -} - -static void -vnic_notify_cb(void *arg, mac_notify_type_t type) -{ - vnic_mac_t *vnic = arg; - vnic_notify_state_t state; + info->vn_link_id = vnic->vn_link_id; + info->vn_mac_addr_type = vnic->vn_addr_type; + info->vn_mac_len = vnic->vn_addr_len; + bcopy(vnic->vn_addr, info->vn_mac_addr, MAXMACADDRLEN); + info->vn_mac_slot = vnic->vn_slot_id; + info->vn_mac_prefix_len = 0; + info->vn_vid = vnic->vn_vid; + info->vn_force = vnic->vn_force; - state.vo_type = type; - state.vo_vnic_mac = vnic; + bzero(&info->vn_resource_props, sizeof (mac_resource_props_t)); + if (vnic->vn_mch != NULL) + mac_resource_ctl_get(vnic->vn_mch, &info->vn_resource_props); - rw_enter(&vnic_lock, RW_READER); - mod_hash_walk(vnic_hash, vnic_notify_walker, &state); rw_exit(&vnic_lock); -} - -static int -vnic_modify_mac_addr(vnic_t *vnic, uint_t mac_len, uchar_t *mac_addr) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - vnic_flow_t *vnic_flow = vnic->vn_flow_ent; - - ASSERT(RW_WRITE_HELD(&vnic_lock)); - - if (mac_len != vnic_mac->va_addr_len) - return (EINVAL); - - vnic_classifier_flow_update_addr(vnic_flow, mac_addr); return (0); } -static int -vnic_promisc_set(vnic_t *vnic, boolean_t on) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int r = -1; - - if (vnic->vn_promisc == on) - return (0); - - if (on) { - if ((r = mac_promisc_set(vnic_mac->va_mh, B_TRUE, - MAC_DEVPROMISC)) != 0) { - return (r); - } - - rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER); - vnic->vn_promisc_next = vnic_mac->va_promisc; - vnic_mac->va_promisc = vnic; - vnic_mac->va_promisc_gen++; - - vnic->vn_promisc = B_TRUE; - rw_exit(&vnic_mac->va_promisc_lock); - - return (0); - } else { - vnic_t *loop, *prev = NULL; - - rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER); - loop = vnic_mac->va_promisc; - - while ((loop != NULL) && (loop != vnic)) { - prev = loop; - loop = loop->vn_promisc_next; - } - - if ((loop != NULL) && - ((r = mac_promisc_set(vnic_mac->va_mh, B_FALSE, - MAC_DEVPROMISC)) == 0)) { - if (prev != NULL) - prev->vn_promisc_next = loop->vn_promisc_next; - else - vnic_mac->va_promisc = loop->vn_promisc_next; - vnic_mac->va_promisc_gen++; - - vnic->vn_promisc = B_FALSE; - } - 
rw_exit(&vnic_mac->va_promisc_lock); - - return (r); - } -} - -void -vnic_promisc_rx(vnic_mac_t *vnic_mac, vnic_t *sender, mblk_t *mp) +static void +vnic_notify_cb(void *arg, mac_notify_type_t type) { - vnic_t *loop; - vnic_flow_t *flow; - const vnic_flow_fn_info_t *fn_info; - mac_header_info_t hdr_info; - boolean_t dst_must_match = B_TRUE; - - ASSERT(mp->b_next == NULL); - - rw_enter(&vnic_mac->va_promisc_lock, RW_READER); - if (vnic_mac->va_promisc == NULL) - goto done; - - if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0) - goto done; + vnic_t *vnic = arg; /* - * If this is broadcast or multicast then the destination - * address need not match for us to deliver it. + * Only the VLAN VNIC needs to be notified with primary MAC + * address change. */ - if ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) || - (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST)) - dst_must_match = B_FALSE; - - for (loop = vnic_mac->va_promisc; - loop != NULL; - loop = loop->vn_promisc_next) { - if (loop == sender) - continue; - - if (dst_must_match && - (bcmp(hdr_info.mhi_daddr, loop->vn_addr, - sizeof (loop->vn_addr)) != 0)) - continue; - - flow = loop->vn_flow_ent; - ASSERT(flow != NULL); - - if (!flow->vf_is_active) { - mblk_t *copy; - uint64_t gen; - - if ((copy = vnic_copymsg_cksum(mp)) == NULL) - break; - if ((sender != NULL) && - ((copy = vnic_fix_cksum(copy)) == NULL)) - break; - - VNIC_FLOW_REFHOLD(flow); - gen = vnic_mac->va_promisc_gen; - rw_exit(&vnic_mac->va_promisc_lock); - - fn_info = vnic_classifier_get_fn_info(flow); - (fn_info->ff_fn)(fn_info->ff_arg1, - fn_info->ff_arg2, copy); - - VNIC_FLOW_REFRELE(flow); - rw_enter(&vnic_mac->va_promisc_lock, RW_READER); - if (vnic_mac->va_promisc_gen != gen) - break; - } + if (vnic->vn_addr_type != VNIC_MAC_ADDR_TYPE_PRIMARY) + return; + + switch (type) { + case MAC_NOTE_UNICST: + /* the unicast MAC address value */ + mac_unicast_primary_get(vnic->vn_lower_mh, vnic->vn_addr); + + /* notify its upper layer MAC about MAC address change */ + mac_unicst_update(vnic->vn_mh, (const uint8_t *)vnic->vn_addr); + break; + default: + break; } -done: - rw_exit(&vnic_mac->va_promisc_lock); } diff --git a/usr/src/uts/common/io/wpi/wpi.c b/usr/src/uts/common/io/wpi/wpi.c index 00878f64ce..bd817f22c5 100644 --- a/usr/src/uts/common/io/wpi/wpi.c +++ b/usr/src/uts/common/io/wpi/wpi.c @@ -42,7 +42,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -371,7 +371,6 @@ mac_callbacks_t wpi_m_callbacks = { wpi_m_multicst, wpi_m_unicst, wpi_m_tx, - NULL, wpi_m_ioctl, NULL, NULL, diff --git a/usr/src/uts/common/io/xge/drv/xge.c b/usr/src/uts/common/io/xge/drv/xge.c index c41f82d706..6ee52f4262 100644 --- a/usr/src/uts/common/io/xge/drv/xge.c +++ b/usr/src/uts/common/io/xge/drv/xge.c @@ -65,34 +65,6 @@ ddi_device_acc_attr_t xge_dev_attr = { ddi_device_acc_attr_t *p_xge_dev_attr = &xge_dev_attr; /* - * xge_event - * - * This function called by HAL to notify upper layer that some any - * event been produced. 
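The slimmed-down vnic_notify_cb() above is a narrow observer: it ignores every notification except MAC_NOTE_UNICST, and acts only for primary-address VNICs. The callback shape as a standalone sketch (types and the refresh step are stand-ins, not the MAC layer's API):

    #include <stdio.h>

    typedef enum { NOTE_UNICST, NOTE_LINK, NOTE_TX } note_t;
    typedef enum { ADDR_FIXED, ADDR_PRIMARY } addr_type_t;

    typedef struct toy_vnic {
            addr_type_t     addr_type;
            unsigned char   addr[6];
    } toy_vnic_t;

    static void
    notify_cb(void *arg, note_t type)
    {
            toy_vnic_t *v = arg;

            /* Only primary-address VNICs track the lower MAC's address. */
            if (v->addr_type != ADDR_PRIMARY)
                    return;

            switch (type) {
            case NOTE_UNICST:
                    /* refresh v->addr from below, then notify upward */
                    (void) printf("address changed, propagating up\n");
                    break;
            default:
                    break;
            }
    }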
- */ -void -xge_event(xge_queue_item_t *item) -{ - xgell_fifo_t *fifo = item->context; - xgelldev_t *lldev = fifo->lldev; - - switch (item->event_type) { - case XGELL_EVENT_RESCHED_NEEDED: - if (lldev->is_initialized) { - if (xge_hal_channel_dtr_count(fifo->channelh) - >= XGELL_TX_LEVEL_HIGH) { - mac_tx_update(lldev->mh); - xge_debug_osdep(XGE_TRACE, "%s", - "mac_tx_update happened!"); - } - } - break; - default: - break; - } -} - -/* * xgell_callback_crit_err * * This function called by HAL on Serious Error event. XGE_HAL_EVENT_SERR. @@ -139,18 +111,6 @@ xge_xpak_alarm_log(void *userdata, xge_hal_xpak_alarm_type_e type) } /* - * xge_queue_produce context - */ -static void -xge_callback_event_queued(xge_hal_device_h devh, int event_type) -{ - if (event_type == XGELL_EVENT_RESCHED_NEEDED) { - (void) taskq_dispatch(system_taskq, xge_device_poll_now, devh, - TQ_NOSLEEP); - } -} - -/* * xge_driver_init_hal * * To initialize HAL portion of driver. @@ -167,8 +127,8 @@ xge_driver_init_hal(void) uld_callbacks.link_up = xgell_callback_link_up; uld_callbacks.link_down = xgell_callback_link_down; uld_callbacks.crit_err = xge_callback_crit_err; - uld_callbacks.event = xge_event; - uld_callbacks.event_queued = xge_callback_event_queued; + uld_callbacks.event = NULL; + uld_callbacks.event_queued = NULL; uld_callbacks.before_device_poll = NULL; uld_callbacks.after_device_poll = NULL; uld_callbacks.sched_timer = NULL; @@ -241,7 +201,6 @@ _info(struct modinfo *pModinfo) return (mod_info(&modlinkage, pModinfo)); } -/* ARGSUSED */ /* * xge_isr * @arg: pointer to device private strucutre(hldev) @@ -249,6 +208,7 @@ _info(struct modinfo *pModinfo) * This is the ISR scheduled by the OS to indicate to the * driver that the receive/transmit operation is completed. */ +/* ARGSUSED */ static uint_t xge_isr(caddr_t arg0, caddr_t arg1) { @@ -308,262 +268,263 @@ xge_ring_msix_isr(caddr_t arg0, caddr_t arg1) * Configure single ring */ static void -xge_ring_config(dev_info_t *dev_info, - xge_hal_device_config_t *device_config, int num) +xge_ring_config(dev_info_t *dev_info, xge_hal_device_config_t *device_config, + int index) { char msg[MSG_SIZE]; - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_configured", num); - device_config->ring.queue[num].configured = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_configured", index); + device_config->ring.queue[index].configured = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, - msg, num < XGELL_MAX_RING_DEFAULT ? 1 : 0); + msg, index < XGELL_RX_RING_NUM_MAX ? 
1 : 0); /* no point to configure it further if unconfigured */ - if (!device_config->ring.queue[num].configured) + if (!device_config->ring.queue[index].configured) return; #if defined(__sparc) - device_config->ring.queue[num].no_snoop_bits = 1; + device_config->ring.queue[index].no_snoop_bits = 1; #endif - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", num); - device_config->ring.queue[num].max = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", index); + device_config->ring.queue[index].max = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_initial", num); - device_config->ring.queue[num].initial = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_initial", index); + device_config->ring.queue[index].initial = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - if (device_config->ring.queue[num].initial == + if (device_config->ring.queue[index].initial == XGE_HAL_DEFAULT_USE_HARDCODE) { - if (device_config->mtu > XGE_HAL_DEFAULT_MTU) { - device_config->ring.queue[num].initial = - device_config->ring.queue[num].max = - XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_J; - } else { - device_config->ring.queue[num].initial = - device_config->ring.queue[num].max = - XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_N; - } + device_config->ring.queue[index].initial = + device_config->ring.queue[index].max = + XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS; } - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_buffer_mode", num); - device_config->ring.queue[num].buffer_mode = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_buffer_mode", index); + device_config->ring.queue[index].buffer_mode = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_RING_QUEUE_BUFFER_MODE_DEFAULT); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_dram_size_mb", num); - device_config->ring.queue[num].dram_size_mb = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_dram_size_mb", index); + device_config->ring.queue[index].dram_size_mb = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); (void) xge_os_snprintf(msg, MSG_SIZE, - "ring%d_backoff_interval_us", num); - device_config->ring.queue[num].backoff_interval_us = + "ring%d_backoff_interval_us", index); + device_config->ring.queue[index].backoff_interval_us = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_BACKOFF_INTERVAL_US); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max_frm_len", num); - device_config->ring.queue[num].max_frm_len = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max_frm_len", index); + device_config->ring.queue[index].max_frm_len = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_RING_USE_MTU); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_priority", num); - device_config->ring.queue[num].priority = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_priority", index); + device_config->ring.queue[index].priority = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RING_PRIORITY); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_a", num); - device_config->ring.queue[num].rti.urange_a = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_a", index); + device_config->ring.queue[index].rti.urange_a = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_URANGE_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_a", num); - 
device_config->ring.queue[num].rti.ufc_a = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_a", index); + device_config->ring.queue[index].rti.ufc_a = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_UFC_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_b", num); - device_config->ring.queue[num].rti.urange_b = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_b", index); + device_config->ring.queue[index].rti.urange_b = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_URANGE_B); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_b", num); - device_config->ring.queue[num].rti.ufc_b = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_b", index); + device_config->ring.queue[index].rti.ufc_b = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, device_config->mtu > XGE_HAL_DEFAULT_MTU ? XGE_HAL_DEFAULT_RX_UFC_B_J: XGE_HAL_DEFAULT_RX_UFC_B_N); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_c", num); - device_config->ring.queue[num].rti.urange_c = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_c", index); + device_config->ring.queue[index].rti.urange_c = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_URANGE_C); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_c", num); - device_config->ring.queue[num].rti.ufc_c = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_c", index); + device_config->ring.queue[index].rti.ufc_c = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, device_config->mtu > XGE_HAL_DEFAULT_MTU ? XGE_HAL_DEFAULT_RX_UFC_C_J: XGE_HAL_DEFAULT_RX_UFC_C_N); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_d", num); - device_config->ring.queue[num].rti.ufc_d = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_d", index); + device_config->ring.queue[index].rti.ufc_d = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_UFC_D); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_val", num); - device_config->ring.queue[num].rti.timer_val_us = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_val", index); + device_config->ring.queue[index].rti.timer_val_us = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_TIMER_VAL); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_ac_en", num); - device_config->ring.queue[num].rti.timer_ac_en = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_ac_en", index); + device_config->ring.queue[index].rti.timer_ac_en = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_TIMER_AC_EN); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_indicate_max_pkts", num); - device_config->ring.queue[num].indicate_max_pkts = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_indicate_max_pkts", + index); + device_config->ring.queue[index].indicate_max_pkts = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, (device_config->bimodal_interrupts ? XGE_HAL_DEFAULT_INDICATE_MAX_PKTS_B : XGE_HAL_DEFAULT_INDICATE_MAX_PKTS_N)); - if (device_config->ring.queue[num].configured) { - /* enable RTH steering by default */ - device_config->ring.queue[num].rth_en = 1; - device_config->rth_en = XGE_HAL_RTH_ENABLE; - device_config->rth_bucket_size = XGE_HAL_MAX_RTH_BUCKET_SIZE; - device_config->rth_spdm_en = XGE_HAL_RTH_SPDM_DISABLE; - device_config->rth_spdm_use_l4 = XGE_HAL_RTH_SPDM_USE_L4; - } + /* + * Enable RTH steering on this ring when RTH is enabled device-wide.
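+ * The device-wide rth_en flag is chosen by the grouping policy in + * xge_configuration_init(); here the individual ring is only opted in.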
+ */ + if (device_config->rth_en == XGE_HAL_RTH_ENABLE) + device_config->ring.queue[index].rth_en = 1; } /* * Configure single fifo */ static void -xge_fifo_config(dev_info_t *dev_info, - xge_hal_device_config_t *device_config, int num) +xge_fifo_config(dev_info_t *dev_info, xge_hal_device_config_t *device_config, + int index) { char msg[MSG_SIZE]; - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_configured", num); - device_config->fifo.queue[num].configured = + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_configured", index); + device_config->fifo.queue[index].configured = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, - msg, num < XGELL_MAX_FIFO_DEFAULT ? 1 : 0); + msg, index < XGELL_TX_RING_NUM_MAX ? 1 : 0); /* no point to configure it further */ - if (!device_config->fifo.queue[num].configured) + if (!device_config->fifo.queue[index].configured) return; #if defined(__sparc) - device_config->fifo.queue[num].no_snoop_bits = 1; + device_config->fifo.queue[index].no_snoop_bits = 1; #endif - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_max", num); - device_config->fifo.queue[num].max = ddi_prop_get_int(DDI_DEV_T_ANY, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_max", index); + device_config->fifo.queue[index].max = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_initial", num); - device_config->fifo.queue[num].initial = ddi_prop_get_int(DDI_DEV_T_ANY, - dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_initial", index); + device_config->fifo.queue[index].initial = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - if (device_config->fifo.queue[num].initial == +#if 0 + if (device_config->fifo.queue[index].initial == XGE_HAL_DEFAULT_USE_HARDCODE) { if (device_config->mtu > XGE_HAL_DEFAULT_MTU) { - device_config->fifo.queue[num].initial = - device_config->fifo.queue[num].max = + device_config->fifo.queue[index].initial = + device_config->fifo.queue[index].max = XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_J; } else { - device_config->fifo.queue[num].initial = - device_config->fifo.queue[num].max = + device_config->fifo.queue[index].initial = + device_config->fifo.queue[index].max = XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_N; } } +#else + if (device_config->fifo.queue[index].initial == + XGE_HAL_DEFAULT_USE_HARDCODE) { + device_config->fifo.queue[index].max = + device_config->fifo.queue[index].initial = + XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_A; + } +#endif - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_intr", num); - device_config->fifo.queue[num].intr = ddi_prop_get_int(DDI_DEV_T_ANY, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_intr", index); + device_config->fifo.queue[index].intr = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_FIFO_QUEUE_INTR); /* * TTI 0 configuration */ - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_enable", num); - device_config->fifo.queue[num].tti[num].enabled = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_enable", index); + device_config->fifo.queue[index].tti[index].enabled = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, 1); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_a", num); - device_config->fifo.queue[num].tti[num].urange_a = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_a", index); + device_config->fifo.queue[index].tti[index].urange_a = ddi_prop_get_int( 
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_URANGE_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_a", num); - device_config->fifo.queue[num].tti[num].ufc_a = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_a", index); + device_config->fifo.queue[index].tti[index].ufc_a = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_b", num); - device_config->fifo.queue[num].tti[num].urange_b = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_b", index); + device_config->fifo.queue[index].tti[index].urange_b = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_URANGE_B); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_b", num); - device_config->fifo.queue[num].tti[num].ufc_b = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_b", index); + device_config->fifo.queue[index].tti[index].ufc_b = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_B); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_c", num); - device_config->fifo.queue[num].tti[num].urange_c = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_c", index); + device_config->fifo.queue[index].tti[index].urange_c = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_URANGE_C); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_c", num); - device_config->fifo.queue[num].tti[num].ufc_c = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_c", index); + device_config->fifo.queue[index].tti[index].ufc_c = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_C); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_d", num); - device_config->fifo.queue[num].tti[num].ufc_d = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_d", index); + device_config->fifo.queue[index].tti[index].ufc_d = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_D); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_timer_ac_en", num); - device_config->fifo.queue[num].tti[num].timer_ac_en = ddi_prop_get_int( - DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_timer_ac_en", index); + device_config->fifo.queue[index].tti[index].timer_ac_en = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_TIMER_AC_EN); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_val", num); - device_config->fifo.queue[num].tti[num].timer_val_us = ddi_prop_get_int( - DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_val", index); + device_config->fifo.queue[index].tti[index].timer_val_us = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_TIMER_VAL); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_ci_en", num); - device_config->fifo.queue[num].tti[num].timer_ci_en = ddi_prop_get_int( - DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_ci_en", index); + device_config->fifo.queue[index].tti[index].timer_ci_en = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_TIMER_CI_EN); } @@ -577,11 +538,57 @@ xge_fifo_config(dev_info_t *dev_info, 
*/ static void xge_configuration_init(dev_info_t *dev_info, - xge_hal_device_config_t *device_config, xgell_config_t *ll_config) + xge_hal_device_config_t *device_config, xgell_config_t *xgell_config) { int i, rings_configured = 0, fifos_configured = 0; /* + * Initialize link layer configuration first + */ + xgell_config->rx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "rx_dma_lowat", XGELL_RX_DMA_LOWAT); + xgell_config->rx_pkt_burst = ddi_prop_get_int(DDI_DEV_T_ANY, + dev_info, DDI_PROP_DONTPASS, "rx_pkt_burst", XGELL_RX_PKT_BURST); + xgell_config->tx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "tx_dma_lowat", XGELL_TX_DMA_LOWAT); + xgell_config->lso_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "lso_enable", XGELL_CONF_ENABLE_BY_DEFAULT); + xgell_config->msix_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "msix_enable", XGELL_CONF_ENABLE_BY_DEFAULT); + + xgell_config->grouping = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "grouping", XGELL_CONF_GROUP_POLICY_DEFAULT); + + switch (xgell_config->grouping) { + case XGELL_CONF_GROUP_POLICY_VIRT: + /* + * Enable layer 2 steering for better virtualization + */ + device_config->rth_en = XGE_HAL_RTH_DISABLE; + device_config->rts_mac_en = XGE_HAL_RTS_MAC_ENABLE; + break; + case XGELL_CONF_GROUP_POLICY_PERF: + /* + * Configure layer 4 RTH to hashing inbound traffic + */ + device_config->rth_en = XGE_HAL_RTH_ENABLE; + device_config->rth_bucket_size = XGE_HAL_MAX_RTH_BUCKET_SIZE; + device_config->rth_spdm_en = XGE_HAL_RTH_SPDM_DISABLE; + device_config->rth_spdm_use_l4 = XGE_HAL_RTH_SPDM_USE_L4; + + device_config->rts_mac_en = XGE_HAL_RTS_MAC_DISABLE; + break; + case XGELL_CONF_GROUP_POLICY_BASIC: + default: + /* + * Disable both RTS and RTH for single ring configuration + */ + device_config->rth_en = XGE_HAL_RTH_DISABLE; + device_config->rts_mac_en = XGE_HAL_RTS_MAC_DISABLE; + break; + } + + /* * Initialize common properties */ device_config->mtu = ddi_prop_get_int(DDI_DEV_T_ANY, @@ -634,12 +641,6 @@ xge_configuration_init(dev_info_t *dev_info, XGE_HAL_DEFAULT_BIMODAL_TIMER_HI_US); /* - * MSI-X switch - */ - ll_config->msix_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "msix_enable", XGELL_CONF_ENABLE_BY_DEFAULT); - - /* * Go through all possibly configured rings. Each ring could be * configured individually. To enable/disable specific ring, just * set ring->configured = [1|0]. 
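 * For example, a hypothetical xge.conf fragment built from the * properties read above could turn ring 7 off and cap ring 0: * ring7_configured = 0; * ring0_max = 16;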
@@ -740,30 +741,20 @@ xge_configuration_init(dev_info_t *dev_info, XGE_HAL_DEFAULT_LRO_FRM_LEN); /* - * Initialize link layer configuration + * Initialize other link layer configuration first */ - ll_config->rx_buffer_total = ddi_prop_get_int(DDI_DEV_T_ANY, + xgell_config->rx_buffer_total = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, "rx_buffer_total", - device_config->ring.queue[XGELL_RING_MAIN_QID].initial * + device_config->ring.queue[XGELL_RX_RING_MAIN].initial * XGELL_RX_BUFFER_TOTAL); - ll_config->rx_buffer_total += XGELL_RX_BUFFER_RECYCLE_CACHE; - ll_config->rx_buffer_post_hiwat = ddi_prop_get_int(DDI_DEV_T_ANY, + xgell_config->rx_buffer_total += XGELL_RX_BUFFER_RECYCLE_CACHE; + xgell_config->rx_buffer_post_hiwat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, "rx_buffer_post_hiwat", - device_config->ring.queue[XGELL_RING_MAIN_QID].initial * + device_config->ring.queue[XGELL_RX_RING_MAIN].initial * XGELL_RX_BUFFER_POST_HIWAT); - ll_config->rx_buffer_post_hiwat += XGELL_RX_BUFFER_RECYCLE_CACHE; - ll_config->rx_pkt_burst = ddi_prop_get_int(DDI_DEV_T_ANY, - dev_info, DDI_PROP_DONTPASS, "rx_pkt_burst", - XGELL_RX_PKT_BURST); - ll_config->rx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "rx_dma_lowat", XGELL_RX_DMA_LOWAT); - ll_config->tx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "tx_dma_lowat", XGELL_TX_DMA_LOWAT); - ll_config->lso_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "lso_enable", XGELL_CONF_ENABLE_BY_DEFAULT); + xgell_config->rx_buffer_post_hiwat += XGELL_RX_BUFFER_RECYCLE_CACHE; } - /* * xge_alloc_intrs: * @@ -847,6 +838,7 @@ _err_exit2: } _err_exit1: kmem_free(lldev->intr_table, lldev->intr_table_size); + lldev->intr_table = NULL; _err_exit0: if (lldev->intr_type == DDI_INTR_TYPE_MSIX) (void) ddi_prop_remove(DDI_DEV_T_NONE, dip, "#msix-request"); @@ -869,6 +861,7 @@ xge_free_intrs(xgelldev_t *lldev) (void) ddi_intr_free(lldev->intr_table[i]); } kmem_free(lldev->intr_table, lldev->intr_table_size); + lldev->intr_table = NULL; if (lldev->intr_type == DDI_INTR_TYPE_MSIX) (void) ddi_prop_remove(DDI_DEV_T_NONE, dip, "#msix-request"); @@ -889,9 +882,10 @@ xge_add_intrs(xgelldev_t *lldev) xge_hal_fifo_config_t *fifo_conf = &hal_conf->fifo; xge_list_t *item; int msix_idx = 1; /* 0 by default is reserved for Alarms. 
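 * Ring and fifo channels are then bound to vectors 1..n in the * order they are walked below.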
*/ - xge_hal_channel_t *assigned[XGELL_MAX_RING_DEFAULT + - XGELL_MAX_FIFO_DEFAULT + 1]; + xge_hal_channel_t *assigned[XGELL_RX_RING_NUM_MAX + + XGELL_TX_RING_NUM_MAX + 1]; + xge_assert(lldev->intr_table != NULL); switch (lldev->intr_type) { case DDI_INTR_TYPE_FIXED: ret = ddi_intr_add_handler(lldev->intr_table[0], @@ -1054,6 +1048,8 @@ xge_rem_intrs(xgelldev_t *lldev) { int i; + xge_assert(lldev->intr_table != NULL); + /* Call ddi_intr_remove_handler() */ for (i = 0; i < lldev->intr_cnt; i++) { (void) ddi_intr_remove_handler(lldev->intr_table[i]); @@ -1079,11 +1075,11 @@ static int xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) { xgelldev_t *ll; + xgell_config_t *xgell_config; xge_hal_device_config_t *device_config; xge_hal_device_t *hldev; xge_hal_device_attr_t attr; xge_hal_status_e status; - xgell_config_t ll_config; int ret, intr_types, i; xge_debug_osdep(XGE_TRACE, "XGE_ATTACH cmd %d", cmd); @@ -1104,10 +1100,13 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) goto _exit0; } + xgell_config = kmem_zalloc(sizeof (xgell_config_t), KM_SLEEP); device_config = kmem_zalloc(sizeof (xge_hal_device_config_t), KM_SLEEP); - /* Init device_config by lookup up properties from .conf file */ - xge_configuration_init(dev_info, device_config, &ll_config); + /* + * Initialize all configurations + */ + xge_configuration_init(dev_info, device_config, xgell_config); /* Determine which types of interrupts supported */ ret = ddi_intr_get_supported_types(dev_info, &intr_types); @@ -1161,7 +1160,34 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) goto _exit3; } - if (ll_config.msix_enable && intr_types & DDI_INTR_TYPE_MSIX) { + /* + * Init multiple rings configuration + */ + switch (xgell_config->grouping) { + case XGELL_CONF_GROUP_POLICY_VIRT: + ll->init_rx_rings = XGELL_RX_RING_NUM_MAX; /* 8 */ + ll->init_tx_rings = XGELL_TX_RING_NUM_MAX; /* 8 */ + ll->init_rx_groups = ll->init_rx_rings; + break; + case XGELL_CONF_GROUP_POLICY_PERF: + ll->init_rx_rings = XGELL_RX_RING_NUM_MAX; /* 8 */ + ll->init_tx_rings = XGELL_TX_RING_NUM_MAX; /* 8 */ + ll->init_rx_groups = 1; + break; + case XGELL_CONF_GROUP_POLICY_BASIC: + ll->init_rx_rings = XGELL_RX_RING_NUM_MIN; /* 1 */ + ll->init_tx_rings = XGELL_TX_RING_NUM_MIN; /* 1 */ + ll->init_rx_groups = ll->init_rx_rings; + break; + default: + ASSERT(0); + break; + } + + /* + * Init MSI-X configuration + */ + if (xgell_config->msix_enable && intr_types & DDI_INTR_TYPE_MSIX) { ll->intr_type = DDI_INTR_TYPE_MSIX; ll->intr_cnt = 1; for (i = 0; i < XGE_HAL_MAX_FIFO_NUM; i++) @@ -1175,9 +1201,12 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) ll->intr_cnt = 1; } + /* + * Allocate interrupt(s) + */ while ((ret = xge_alloc_intrs(ll)) != DDI_SUCCESS) { if (ll->intr_type == DDI_INTR_TYPE_MSIX) { - ll_config.msix_enable = 0; + xgell_config->msix_enable = 0; ll->intr_type = DDI_INTR_TYPE_FIXED; ll->intr_cnt = 1; device_config->intr_mode = XGE_HAL_INTR_MODE_IRQLINE; @@ -1231,7 +1260,7 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) goto _exit4; /* allocate and register Link Layer */ - ret = xgell_device_register(ll, &ll_config); + ret = xgell_device_register(ll, xgell_config); if (ret != DDI_SUCCESS) { goto _exit5; } @@ -1240,6 +1269,7 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) xge_hal_device_private_set(hldev, ll); kmem_free(device_config, sizeof (xge_hal_device_config_t)); + kmem_free(xgell_config, sizeof (xgell_config_t)); return (DDI_SUCCESS); @@ -1263,6 +1293,7 @@ _exit1: ddi_regs_map_free(&attr.regh0); _exit0a: 
kmem_free(device_config, sizeof (xge_hal_device_config_t)); + kmem_free(xgell_config, sizeof (xgell_config_t)); _exit0: return (ret); } @@ -1298,7 +1329,7 @@ xge_quiesce(dev_info_t *dev_info) * This function is called by OS when the system is about * to shutdown or when the super user tries to unload * the driver. This function frees all the memory allocated - * during xge_attch() and also unregisters the Xframe + * during xge_attach() and also unregisters the Xframe * device instance from the GLD framework. */ static int diff --git a/usr/src/uts/common/io/xge/drv/xge_osdep.h b/usr/src/uts/common/io/xge/drv/xge_osdep.h index 18923972ee..4b09b0f983 100644 --- a/usr/src/uts/common/io/xge/drv/xge_osdep.h +++ b/usr/src/uts/common/io/xge/drv/xge_osdep.h @@ -37,8 +37,6 @@ #ifndef _SYS_XGE_OSDEP_H #define _SYS_XGE_OSDEP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/varargs.h> diff --git a/usr/src/uts/common/io/xge/drv/xgell.c b/usr/src/uts/common/io/xge/drv/xgell.c index 85db35ddcc..4ec1117750 100644 --- a/usr/src/uts/common/io/xge/drv/xgell.c +++ b/usr/src/uts/common/io/xge/drv/xgell.c @@ -24,10 +24,8 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * Copyright (c) 2002-2005 Neterion, Inc. + * Copyright (c) 2002-2008 Neterion, Inc. * All right Reserved. * * FileName : xgell.c @@ -100,9 +98,7 @@ static int xgell_m_start(void *); static void xgell_m_stop(void *); static int xgell_m_promisc(void *, boolean_t); static int xgell_m_multicst(void *, boolean_t, const uint8_t *); -static int xgell_m_unicst(void *, const uint8_t *); static void xgell_m_ioctl(void *, queue_t *, mblk_t *); -static mblk_t *xgell_m_tx(void *, mblk_t *); static boolean_t xgell_m_getcapab(void *, mac_capab_t, void *); #define XGELL_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) @@ -114,8 +110,7 @@ static mac_callbacks_t xgell_m_callbacks = { xgell_m_stop, xgell_m_promisc, xgell_m_multicst, - xgell_m_unicst, - xgell_m_tx, + NULL, NULL, xgell_m_ioctl, xgell_m_getcapab @@ -124,7 +119,7 @@ static mac_callbacks_t xgell_m_callbacks = { /* * xge_device_poll * - * Cyclic should call me every 1s. xge_callback_event_queued should call me + * Timeout should call me every 1s. xge_callback_event_queued should call me * when HAL hope event was rescheduled. */ /*ARGSUSED*/ @@ -194,32 +189,34 @@ xgell_callback_link_down(void *userdata) * xgell_rx_buffer_replenish_all * * To replenish all freed dtr(s) with buffers in free pool. It's called by - * xgell_rx_buffer_recycle() or xgell_rx_1b_compl(). + * xgell_rx_buffer_recycle() or xgell_rx_1b_callback(). * Must be called with pool_lock held. 
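+ * Each buffer taken off the free list is bound to a freshly reserved + * descriptor (dtr) and re-posted to the hardware ring; the loop stops + * when either the free list or the descriptor space runs out.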
*/ static void -xgell_rx_buffer_replenish_all(xgell_ring_t *ring) +xgell_rx_buffer_replenish_all(xgell_rx_ring_t *ring) { + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xge_hal_dtr_h dtr; xgell_rx_buffer_t *rx_buffer; xgell_rxd_priv_t *rxd_priv; - xge_assert(mutex_owned(&ring->bf_pool.pool_lock)); + xge_assert(mutex_owned(&bf_pool->pool_lock)); + + while ((bf_pool->free > 0) && + (xge_hal_ring_dtr_reserve(ring->channelh, &dtr) == XGE_HAL_OK)) { + xge_assert(bf_pool->head); - while ((ring->bf_pool.free > 0) && - (xge_hal_ring_dtr_reserve(ring->channelh, &dtr) == - XGE_HAL_OK)) { - rx_buffer = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer->next; - ring->bf_pool.free--; + rx_buffer = bf_pool->head; + + bf_pool->head = rx_buffer->next; + bf_pool->free--; - xge_assert(rx_buffer); xge_assert(rx_buffer->dma_addr); rxd_priv = (xgell_rxd_priv_t *) xge_hal_ring_dtr_private(ring->channelh, dtr); xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, - ring->bf_pool.size); + bf_pool->size); rxd_priv->rx_buffer = rx_buffer; xge_hal_ring_dtr_post(ring->channelh, dtr); @@ -235,15 +232,16 @@ xgell_rx_buffer_replenish_all(xgell_ring_t *ring) static void xgell_rx_buffer_release(xgell_rx_buffer_t *rx_buffer) { - xgell_ring_t *ring = rx_buffer->ring; + xgell_rx_ring_t *ring = rx_buffer->ring; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; - xge_assert(mutex_owned(&ring->bf_pool.pool_lock)); + xge_assert(mutex_owned(&bf_pool->pool_lock)); /* Put the buffer back to pool */ - rx_buffer->next = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer; + rx_buffer->next = bf_pool->head; + bf_pool->head = rx_buffer; - ring->bf_pool.free++; + bf_pool->free++; } /* @@ -266,7 +264,7 @@ static void xgell_rx_buffer_recycle(char *arg) { xgell_rx_buffer_t *rx_buffer = (xgell_rx_buffer_t *)arg; - xgell_ring_t *ring = rx_buffer->ring; + xgell_rx_ring_t *ring = rx_buffer->ring; xgelldev_t *lldev = ring->lldev; xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; @@ -282,18 +280,17 @@ xgell_rx_buffer_recycle(char *arg) * Before finding a good way to set this hiwat, just always call to * replenish_all. *TODO* */ - if ((lldev->is_initialized != 0) && + if ((lldev->is_initialized != 0) && (ring->live) && (bf_pool->recycle >= XGELL_RX_BUFFER_RECYCLE_CACHE)) { - if (mutex_tryenter(&bf_pool->pool_lock)) { - bf_pool->recycle_tail->next = bf_pool->head; - bf_pool->head = bf_pool->recycle_head; - bf_pool->recycle_head = bf_pool->recycle_tail = NULL; - bf_pool->post -= bf_pool->recycle; - bf_pool->free += bf_pool->recycle; - bf_pool->recycle = 0; - xgell_rx_buffer_replenish_all(ring); - mutex_exit(&bf_pool->pool_lock); - } + mutex_enter(&bf_pool->pool_lock); + bf_pool->recycle_tail->next = bf_pool->head; + bf_pool->head = bf_pool->recycle_head; + bf_pool->recycle_head = bf_pool->recycle_tail = NULL; + bf_pool->post -= bf_pool->recycle; + bf_pool->free += bf_pool->recycle; + bf_pool->recycle = 0; + xgell_rx_buffer_replenish_all(ring); + mutex_exit(&bf_pool->pool_lock); } mutex_exit(&bf_pool->recycle_lock); @@ -306,8 +303,10 @@ xgell_rx_buffer_recycle(char *arg) * Return NULL if failed. 
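 * The DMA-able area is laid out as HEADROOM | packet data | * xgell_rx_buffer_t, so each buffer carries its own recycle * bookkeeping at its tail.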
*/ static xgell_rx_buffer_t * -xgell_rx_buffer_alloc(xgell_ring_t *ring) +xgell_rx_buffer_alloc(xgell_rx_ring_t *ring) { + xgelldev_t *lldev = ring->lldev; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xge_hal_device_t *hldev; void *vaddr; ddi_dma_handle_t dma_handle; @@ -318,7 +317,6 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) size_t real_size; extern ddi_device_acc_attr_t *p_xge_dev_attr; xgell_rx_buffer_t *rx_buffer; - xgelldev_t *lldev = ring->lldev; hldev = (xge_hal_device_t *)lldev->devh; @@ -330,7 +328,7 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) } /* reserve some space at the end of the buffer for recycling */ - if (ddi_dma_mem_alloc(dma_handle, HEADROOM + ring->bf_pool.size + + if (ddi_dma_mem_alloc(dma_handle, HEADROOM + bf_pool->size + sizeof (xgell_rx_buffer_t), p_xge_dev_attr, DDI_DMA_STREAMING, DDI_DMA_SLEEP, 0, (caddr_t *)&vaddr, &real_size, &dma_acch) != DDI_SUCCESS) { @@ -339,7 +337,7 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) goto mem_failed; } - if (HEADROOM + ring->bf_pool.size + sizeof (xgell_rx_buffer_t) > + if (HEADROOM + bf_pool->size + sizeof (xgell_rx_buffer_t) > real_size) { xge_debug_ll(XGE_ERR, "%s%d: can not allocate DMA-able memory", XGELL_IFNAME, lldev->instance); @@ -347,14 +345,14 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) } if (ddi_dma_addr_bind_handle(dma_handle, NULL, (char *)vaddr + HEADROOM, - ring->bf_pool.size, DDI_DMA_READ | DDI_DMA_STREAMING, + bf_pool->size, DDI_DMA_READ | DDI_DMA_STREAMING, DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_SUCCESS) { xge_debug_ll(XGE_ERR, "%s%d: out of mapping for mblk", XGELL_IFNAME, lldev->instance); goto bind_failed; } - if (ncookies != 1 || dma_cookie.dmac_size < ring->bf_pool.size) { + if (ncookies != 1 || dma_cookie.dmac_size < bf_pool->size) { xge_debug_ll(XGE_ERR, "%s%d: can not handle partial DMA", XGELL_IFNAME, lldev->instance); goto check_failed; @@ -393,64 +391,77 @@ handle_failed: * Destroy buffer pool. If there is still any buffer held by the upper layer, * recorded by bf_pool.post, return B_FALSE to reject the unload. */ -static int -xgell_rx_destroy_buffer_pool(xgell_ring_t *ring) +static boolean_t +xgell_rx_destroy_buffer_pool(xgell_rx_ring_t *ring) { + xgelldev_t *lldev = ring->lldev; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xgell_rx_buffer_t *rx_buffer; ddi_dma_handle_t dma_handle; ddi_acc_handle_t dma_acch; - xgelldev_t *lldev = ring->lldev; int i; - if (ring->bf_pool.recycle > 0) { - ring->bf_pool.recycle_tail->next = ring->bf_pool.head; - ring->bf_pool.head = ring->bf_pool.recycle_head; - ring->bf_pool.recycle_tail = - ring->bf_pool.recycle_head = NULL; - ring->bf_pool.post -= ring->bf_pool.recycle; - ring->bf_pool.free += ring->bf_pool.recycle; - ring->bf_pool.recycle = 0; + /* + * If the pool has been destroyed, just return B_TRUE + */ + if (!bf_pool->live) + return (B_TRUE); + + mutex_enter(&bf_pool->recycle_lock); + if (bf_pool->recycle > 0) { + mutex_enter(&bf_pool->pool_lock); + bf_pool->recycle_tail->next = bf_pool->head; + bf_pool->head = bf_pool->recycle_head; + bf_pool->recycle_tail = bf_pool->recycle_head = NULL; + bf_pool->post -= bf_pool->recycle; + bf_pool->free += bf_pool->recycle; + bf_pool->recycle = 0; + mutex_exit(&bf_pool->pool_lock); } + mutex_exit(&bf_pool->recycle_lock); /* * If there is any posted buffer, the driver must refuse to be * detached. Need to notify the upper layer to release them.
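 * (bf_pool.post counts the buffers still loaned out to the stack; the * pool cannot be torn down until it drains back to zero.)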
*/ - if (ring->bf_pool.post != 0) { + if (bf_pool->post != 0) { xge_debug_ll(XGE_ERR, "%s%d has some buffers not be recycled, try later!", XGELL_IFNAME, lldev->instance); - return (DDI_FAILURE); + return (B_FALSE); } /* - * Relase buffers one by one. + * Release buffers one by one. */ - for (i = ring->bf_pool.total; i > 0; i--) { - rx_buffer = ring->bf_pool.head; + for (i = bf_pool->total; i > 0; i--) { + rx_buffer = bf_pool->head; xge_assert(rx_buffer != NULL); - ring->bf_pool.head = rx_buffer->next; + bf_pool->head = rx_buffer->next; dma_handle = rx_buffer->dma_handle; dma_acch = rx_buffer->dma_acch; if (ddi_dma_unbind_handle(dma_handle) != DDI_SUCCESS) { - xge_debug_ll(XGE_ERR, "%s", - "failed to unbind DMA handle!"); - ring->bf_pool.head = rx_buffer; - return (DDI_FAILURE); + xge_debug_ll(XGE_ERR, "failed to unbind DMA handle!"); + bf_pool->head = rx_buffer; + return (B_FALSE); } ddi_dma_mem_free(&dma_acch); ddi_dma_free_handle(&dma_handle); - ring->bf_pool.total--; - ring->bf_pool.free--; + bf_pool->total--; + bf_pool->free--; } - mutex_destroy(&ring->bf_pool.recycle_lock); - mutex_destroy(&ring->bf_pool.pool_lock); - return (DDI_SUCCESS); + xge_assert(!mutex_owned(&bf_pool->pool_lock)); + + mutex_destroy(&bf_pool->recycle_lock); + mutex_destroy(&bf_pool->pool_lock); + bf_pool->live = B_FALSE; + + return (B_TRUE); } /* @@ -458,29 +469,34 @@ xgell_rx_destroy_buffer_pool(xgell_ring_t *ring) * * Initialize RX buffer pool for all RX rings. Refer to rx_buffer_pool_t. */ -static int -xgell_rx_create_buffer_pool(xgell_ring_t *ring) +static boolean_t +xgell_rx_create_buffer_pool(xgell_rx_ring_t *ring) { + xgelldev_t *lldev = ring->lldev; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xge_hal_device_t *hldev; xgell_rx_buffer_t *rx_buffer; - xgelldev_t *lldev = ring->lldev; int i; + if (bf_pool->live) + return (B_TRUE); + hldev = (xge_hal_device_t *)lldev->devh; - ring->bf_pool.total = 0; - ring->bf_pool.size = XGELL_MAX_FRAME_SIZE(hldev); - ring->bf_pool.head = NULL; - ring->bf_pool.free = 0; - ring->bf_pool.post = 0; - ring->bf_pool.post_hiwat = lldev->config.rx_buffer_post_hiwat; - ring->bf_pool.recycle = 0; - ring->bf_pool.recycle_head = NULL; - ring->bf_pool.recycle_tail = NULL; - - mutex_init(&ring->bf_pool.pool_lock, NULL, MUTEX_DRIVER, + bf_pool->total = 0; + bf_pool->size = XGELL_MAX_FRAME_SIZE(hldev); + bf_pool->head = NULL; + bf_pool->free = 0; + bf_pool->post = 0; + bf_pool->post_hiwat = lldev->config.rx_buffer_post_hiwat; + bf_pool->recycle = 0; + bf_pool->recycle_head = NULL; + bf_pool->recycle_tail = NULL; + bf_pool->live = B_TRUE; + + mutex_init(&bf_pool->pool_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(hldev->irqh)); - mutex_init(&ring->bf_pool.recycle_lock, NULL, MUTEX_DRIVER, + mutex_init(&bf_pool->recycle_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(hldev->irqh)); /* @@ -491,17 +507,17 @@ xgell_rx_create_buffer_pool(xgell_ring_t *ring) for (i = 0; i < lldev->config.rx_buffer_total; i++) { if ((rx_buffer = xgell_rx_buffer_alloc(ring)) == NULL) { (void) xgell_rx_destroy_buffer_pool(ring); - return (DDI_FAILURE); + return (B_FALSE); } - rx_buffer->next = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer; + rx_buffer->next = bf_pool->head; + bf_pool->head = rx_buffer; - ring->bf_pool.total++; - ring->bf_pool.free++; + bf_pool->total++; + bf_pool->free++; } - return (DDI_SUCCESS); + return (B_TRUE); } /* @@ -514,23 +530,26 @@ xge_hal_status_e xgell_rx_dtr_replenish(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, int index, void *userdata, xge_hal_channel_reopen_e reopen) { - 
xgell_ring_t *ring = userdata; + xgell_rx_ring_t *ring = userdata; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xgell_rx_buffer_t *rx_buffer; xgell_rxd_priv_t *rxd_priv; - if (ring->bf_pool.head == NULL) { - xge_debug_ll(XGE_ERR, "%s", "no more available rx DMA buffer!"); + mutex_enter(&bf_pool->pool_lock); + if (bf_pool->head == NULL) { + xge_debug_ll(XGE_ERR, "no more available rx DMA buffer!"); return (XGE_HAL_FAIL); } - rx_buffer = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer->next; - ring->bf_pool.free--; - + rx_buffer = bf_pool->head; xge_assert(rx_buffer); xge_assert(rx_buffer->dma_addr); + bf_pool->head = rx_buffer->next; + bf_pool->free--; + mutex_exit(&bf_pool->pool_lock); + rxd_priv = (xgell_rxd_priv_t *)xge_hal_ring_dtr_private(channelh, dtr); - xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, ring->bf_pool.size); + xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, bf_pool->size); rxd_priv->rx_buffer = rx_buffer; @@ -637,9 +656,10 @@ xgell_rx_hcksum_assoc(mblk_t *mp, char *vaddr, int pkt_length, * new message and copy the payload in. */ static mblk_t * -xgell_rx_1b_msg_alloc(xgelldev_t *lldev, xgell_rx_buffer_t *rx_buffer, +xgell_rx_1b_msg_alloc(xgell_rx_ring_t *ring, xgell_rx_buffer_t *rx_buffer, int pkt_length, xge_hal_dtr_info_t *ext_info, boolean_t *copyit) { + xgelldev_t *lldev = ring->lldev; mblk_t *mp; char *vaddr; @@ -676,24 +696,25 @@ xgell_rx_1b_msg_alloc(xgelldev_t *lldev, xgell_rx_buffer_t *rx_buffer, } /* - * xgell_rx_1b_compl + * xgell_rx_1b_callback * * If the interrupt is because of a received frame or if the receive ring * contains fresh as yet un-processed frames, this function is called. */ static xge_hal_status_e -xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, +xgell_rx_1b_callback(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, void *userdata) { - xgell_ring_t *ring = (xgell_ring_t *)userdata; + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)userdata; xgelldev_t *lldev = ring->lldev; xgell_rx_buffer_t *rx_buffer; mblk_t *mp_head = NULL; mblk_t *mp_end = NULL; int pkt_burst = 0; - mutex_enter(&ring->bf_pool.pool_lock); + xge_debug_ll(XGE_TRACE, "xgell_rx_1b_callback on ring %d", ring->index); + mutex_enter(&ring->bf_pool.pool_lock); do { int pkt_length; dma_addr_t dma_data; @@ -744,7 +765,7 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, copyit = B_FALSE; } - mp = xgell_rx_1b_msg_alloc(lldev, rx_buffer, pkt_length, + mp = xgell_rx_1b_msg_alloc(ring, rx_buffer, pkt_length, &ext_info, ©it); xge_hal_ring_dtr_free(channelh, dtr); @@ -771,8 +792,10 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, * Associate cksum_flags per packet type and h/w * cksum flags. */ - xgell_rx_hcksum_assoc(mp, (char *)rx_buffer->vaddr + - HEADROOM, pkt_length, &ext_info); + xgell_rx_hcksum_assoc(mp, (char *)rx_buffer->vaddr + HEADROOM, + pkt_length, &ext_info); + + ring->received_bytes += pkt_length; if (mp_head == NULL) { mp_head = mp; @@ -782,6 +805,26 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, mp_end = mp; } + /* + * Inlined implemented polling function. + */ + if ((ring->poll_mp == NULL) && (ring->poll_bytes > 0)) { + ring->poll_mp = mp_head; + } + if (ring->poll_mp != NULL) { + if ((ring->poll_bytes -= pkt_length) <= 0) { + /* have polled enough packets. */ + break; + } else { + /* continue polling packets. 
*/ + continue; + } + } + + /* + * We're not in polling mode, so try to chain more messages + * or send the chain up according to pkt_burst. + */ if (++pkt_burst < lldev->config.rx_pkt_burst) continue; @@ -791,8 +834,8 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, } mutex_exit(&ring->bf_pool.pool_lock); if (mp_head != NULL) { - mac_rx(lldev->mh, ((xgell_ring_t *)userdata)->handle, - mp_head); + mac_rx_ring(lldev->mh, ring->ring_handle, mp_head, + ring->ring_gen_num); } mp_head = mp_end = NULL; pkt_burst = 0; @@ -807,13 +850,39 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, xgell_rx_buffer_replenish_all(ring); mutex_exit(&ring->bf_pool.pool_lock); - if (mp_head != NULL) { - mac_rx(lldev->mh, ((xgell_ring_t *)userdata)->handle, mp_head); + /* + * If we're not in polling cycle, call mac_rx(), otherwise + * just return while leaving packets chained to ring->poll_mp. + */ + if ((ring->poll_mp == NULL) && (mp_head != NULL)) { + mac_rx_ring(lldev->mh, ring->ring_handle, mp_head, + ring->ring_gen_num); } return (XGE_HAL_OK); } +mblk_t * +xgell_rx_poll(void *arg, int bytes_to_pickup) +{ + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)arg; + int got_rx = 0; + mblk_t *mp; + + xge_debug_ll(XGE_TRACE, "xgell_rx_poll on ring %d", ring->index); + + ring->poll_mp = NULL; + ring->poll_bytes = bytes_to_pickup; + (void) xge_hal_device_poll_rx_channel(ring->channelh, &got_rx); + + mp = ring->poll_mp; + ring->poll_bytes = -1; + ring->polled_bytes += got_rx; + ring->poll_mp = NULL; + + return (mp); +} + /* * xgell_xmit_compl * @@ -826,8 +895,8 @@ static xge_hal_status_e xgell_xmit_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, void *userdata) { - xgell_fifo_t *fifo = (xgell_fifo_t *)userdata; - xgelldev_t *lldev = fifo->lldev; + xgell_tx_ring_t *ring = userdata; + xgelldev_t *lldev = ring->lldev; do { xgell_txd_priv_t *txd_priv = ((xgell_txd_priv_t *) @@ -861,58 +930,36 @@ xgell_xmit_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, txd_priv->mblk = NULL; } - lldev->resched_avail++; - } while (xge_hal_fifo_dtr_next_completed(channelh, &dtr, &t_code) == XGE_HAL_OK); - if (lldev->resched_retry && - xge_queue_produce_context(xge_hal_device_queue(lldev->devh), - XGELL_EVENT_RESCHED_NEEDED, fifo) == XGE_QUEUE_OK) { - xge_debug_ll(XGE_TRACE, "%s%d: IRQ produced event for queue %d", - XGELL_IFNAME, lldev->instance, - ((xge_hal_channel_t *)channelh)->post_qid); - lldev->resched_send = lldev->resched_avail; - lldev->resched_retry = 0; - } + if (ring->need_resched) + mac_tx_ring_update(lldev->mh, ring->ring_handle); return (XGE_HAL_OK); } -/* - * xgell_send - * @hldev: pointer to xge_hal_device_t strucutre - * @mblk: pointer to network buffer, i.e. mblk_t structure - * - * Called by the xgell_m_tx to transmit the packet to the XFRAME firmware. - * A pointer to an M_DATA message that contains the packet is passed to - * this routine. 
- */ -static boolean_t -xgell_send(xgelldev_t *lldev, mblk_t *mp) +mblk_t * +xgell_ring_tx(void *arg, mblk_t *mp) { + xgell_tx_ring_t *ring = (xgell_tx_ring_t *)arg; mblk_t *bp; - boolean_t retry; + xgelldev_t *lldev = ring->lldev; xge_hal_device_t *hldev = lldev->devh; xge_hal_status_e status; xge_hal_dtr_h dtr; xgell_txd_priv_t *txd_priv; uint32_t hckflags; + uint32_t lsoflags; uint32_t mss; int handle_cnt, frag_cnt, ret, i, copied; boolean_t used_copy; - xgell_fifo_t *fifo; - xge_hal_channel_h fifo_channel; _begin: - retry = B_FALSE; handle_cnt = frag_cnt = 0; if (!lldev->is_initialized || lldev->in_reset) - return (B_FALSE); - - fifo = &lldev->fifos[0]; - fifo_channel = fifo->channelh; + return (mp); /* * If the free Tx dtrs count reaches the lower threshold, @@ -921,23 +968,17 @@ _begin: * gld through gld_sched call, when the free dtrs count exceeds * the higher threshold. */ - if (xge_hal_channel_dtr_count(fifo_channel) + if (xge_hal_channel_dtr_count(ring->channelh) <= XGELL_TX_LEVEL_LOW) { - if (++fifo->level_low > XGELL_TX_LEVEL_CHECK) { - xge_debug_ll(XGE_TRACE, "%s%d: queue %d: err on xmit," - "free descriptors count at low threshold %d", - XGELL_IFNAME, lldev->instance, - ((xge_hal_channel_t *)fifo_channel)->post_qid, - XGELL_TX_LEVEL_LOW); - fifo->level_low = 0; - retry = B_TRUE; - goto _exit; - } - } else { - fifo->level_low = 0; + xge_debug_ll(XGE_TRACE, "%s%d: queue %d: err on xmit," + "free descriptors count at low threshold %d", + XGELL_IFNAME, lldev->instance, + ((xge_hal_channel_t *)ring->channelh)->post_qid, + XGELL_TX_LEVEL_LOW); + goto _exit; } - status = xge_hal_fifo_dtr_reserve(fifo_channel, &dtr); + status = xge_hal_fifo_dtr_reserve(ring->channelh, &dtr); if (status != XGE_HAL_OK) { switch (status) { case XGE_HAL_INF_CHANNEL_IS_NOT_READY: @@ -945,19 +986,17 @@ _begin: "%s%d: channel %d is not ready.", XGELL_IFNAME, lldev->instance, ((xge_hal_channel_t *) - fifo_channel)->post_qid); - retry = B_TRUE; + ring->channelh)->post_qid); goto _exit; case XGE_HAL_INF_OUT_OF_DESCRIPTORS: xge_debug_ll(XGE_TRACE, "%s%d: queue %d: error in xmit," " out of descriptors.", XGELL_IFNAME, lldev->instance, ((xge_hal_channel_t *) - fifo_channel)->post_qid); - retry = B_TRUE; + ring->channelh)->post_qid); goto _exit; default: - return (B_FALSE); + return (mp); } } @@ -1002,6 +1041,8 @@ _begin: continue; } + ring->sent_bytes += mblen; + /* * Check the message length to decide to DMA or bcopy() data * to tx descriptor(s). 
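 * Fragments below tx_dma_lowat are bcopy()ed into the descriptor via * xge_hal_fifo_dtr_buffer_append(); larger fragments get their own * DMA binding and are attached with xge_hal_fifo_dtr_buffer_set().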
@@ -1009,7 +1050,7 @@ _begin: if (mblen < lldev->config.tx_dma_lowat && (copied + mblen) < lldev->tx_copied_max) { xge_hal_status_e rc; - rc = xge_hal_fifo_dtr_buffer_append(fifo_channel, + rc = xge_hal_fifo_dtr_buffer_append(ring->channelh, dtr, bp->b_rptr, mblen); if (rc == XGE_HAL_OK) { used_copy = B_TRUE; @@ -1017,11 +1058,11 @@ _begin: continue; } else if (used_copy) { xge_hal_fifo_dtr_buffer_finalize( - fifo_channel, dtr, frag_cnt++); + ring->channelh, dtr, frag_cnt++); used_copy = B_FALSE; } } else if (used_copy) { - xge_hal_fifo_dtr_buffer_finalize(fifo_channel, + xge_hal_fifo_dtr_buffer_finalize(ring->channelh, dtr, frag_cnt++); used_copy = B_FALSE; } @@ -1075,7 +1116,7 @@ _begin: /* setup the descriptors for this data buffer */ while (ncookies) { - xge_hal_fifo_dtr_buffer_set(fifo_channel, dtr, + xge_hal_fifo_dtr_buffer_set(ring->channelh, dtr, frag_cnt++, dma_cookie.dmac_laddress, dma_cookie.dmac_size); if (--ncookies) { @@ -1108,7 +1149,7 @@ _begin: /* finalize unfinished copies */ if (used_copy) { - xge_hal_fifo_dtr_buffer_finalize(fifo_channel, dtr, + xge_hal_fifo_dtr_buffer_finalize(ring->channelh, dtr, frag_cnt++); } @@ -1118,11 +1159,14 @@ _begin: * If LSO is required, just call xge_hal_fifo_dtr_mss_set(dtr, mss) to * do all necessary work. */ - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, &mss, &hckflags); - if ((hckflags & HW_LSO) && (mss != 0)) { + lso_info_get(mp, &mss, &lsoflags); + + if (lsoflags & HW_LSO) { + xge_assert((mss != 0) && (mss <= XGE_HAL_DEFAULT_MTU)); xge_hal_fifo_dtr_mss_set(dtr, mss); } + hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); if (hckflags & HCK_IPV4_HDRCKSUM) { xge_hal_fifo_dtr_cksum_set_bits(dtr, XGE_HAL_TXD_TX_CKO_IPV4_EN); @@ -1132,63 +1176,376 @@ _begin: XGE_HAL_TXD_TX_CKO_UDP_EN); } - xge_hal_fifo_dtr_post(fifo_channel, dtr); + xge_hal_fifo_dtr_post(ring->channelh, dtr); - return (B_TRUE); + return (NULL); _exit_cleanup: - + /* + * Could not successfully transmit but have changed the message, + * so just free it and return NULL + */ for (i = 0; i < handle_cnt; i++) { (void) ddi_dma_unbind_handle(txd_priv->dma_handles[i]); ddi_dma_free_handle(&txd_priv->dma_handles[i]); txd_priv->dma_handles[i] = 0; } - xge_hal_fifo_dtr_free(fifo_channel, dtr); + xge_hal_fifo_dtr_free(ring->channelh, dtr); + + freemsg(mp); + return (NULL); _exit: - if (retry) { - if (lldev->resched_avail != lldev->resched_send && - xge_queue_produce_context(xge_hal_device_queue(lldev->devh), - XGELL_EVENT_RESCHED_NEEDED, fifo) == XGE_QUEUE_OK) { - lldev->resched_send = lldev->resched_avail; - return (B_FALSE); - } else { - lldev->resched_retry = 1; + ring->need_resched = B_TRUE; + return (mp); +} + +/* + * xgell_ring_macaddr_init + */ +static void +xgell_rx_ring_maddr_init(xgell_rx_ring_t *ring) +{ + int i; + xgelldev_t *lldev = ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + int slot_start; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_rx_ring_maddr_init"); + + ring->mmac.naddr = XGE_RX_MULTI_MAC_ADDRESSES_MAX; + ring->mmac.naddrfree = ring->mmac.naddr; + + /* + * For the default rx ring, the first MAC address is the factory one. + * This will be set by the framework, so need to clear it for now. + */ + (void) xge_hal_device_macaddr_clear(hldev, 0); + + /* + * Read the MAC address Configuration Memory from HAL. + * The first slot will hold a factory MAC address, contents in other + * slots will be FF:FF:FF:FF:FF:FF. 
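+ * Each rx ring (group) owns a 32-slot window of that memory, + * starting at slot ring->index * 32.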
+ */ + slot_start = ring->index * 32; + for (i = 0; i < ring->mmac.naddr; i++) { + (void) xge_hal_device_macaddr_get(hldev, slot_start + i, + ring->mmac.mac_addr + i); + ring->mmac.mac_addr_set[i] = B_FALSE; + } +} + +static int xgell_maddr_set(xgelldev_t *, int, uint8_t *); + +static int +xgell_addmac(void *arg, const uint8_t *mac_addr) +{ + xgell_rx_ring_t *ring = arg; + xgelldev_t *lldev = ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + int slot; + int slot_start; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_addmac"); + + mutex_enter(&lldev->genlock); + + if (ring->mmac.naddrfree == 0) { + mutex_exit(&lldev->genlock); + return (ENOSPC); + } + + /* First slot is for factory MAC address */ + for (slot = 0; slot < ring->mmac.naddr; slot++) { + if (ring->mmac.mac_addr_set[slot] == B_FALSE) { + break; } } - if (mp) - freemsg(mp); - return (B_TRUE); + ASSERT(slot < ring->mmac.naddr); + + slot_start = ring->index * 32; + + if (xgell_maddr_set(lldev, slot_start + slot, (uint8_t *)mac_addr) != + 0) { + mutex_exit(&lldev->genlock); + return (EIO); + } + + /* Simply enable RTS for the whole section. */ + (void) xge_hal_device_rts_section_enable(hldev, slot_start + slot); + + /* + * Read back the MAC address from HAL to keep the array up to date. + */ + if (xge_hal_device_macaddr_get(hldev, slot_start + slot, + ring->mmac.mac_addr + slot) != XGE_HAL_OK) { + (void) xge_hal_device_macaddr_clear(hldev, slot_start + slot); + return (EIO); + } + + ring->mmac.mac_addr_set[slot] = B_TRUE; + ring->mmac.naddrfree--; + + mutex_exit(&lldev->genlock); + + return (0); +} + +static int +xgell_remmac(void *arg, const uint8_t *mac_addr) +{ + xgell_rx_ring_t *ring = arg; + xgelldev_t *lldev = ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + xge_hal_status_e status; + int slot; + int slot_start; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_remmac"); + + slot = xge_hal_device_macaddr_find(hldev, (uint8_t *)mac_addr); + if (slot == -1) + return (EINVAL); + + slot_start = ring->index * 32; + + /* + * Adjust slot to the offset in the MAC array of this ring (group). + */ + slot -= slot_start; + + /* + * Only can remove a pre-set MAC address for this ring (group). + */ + if (slot < 0 || slot >= ring->mmac.naddr) + return (EINVAL); + + + xge_assert(ring->mmac.mac_addr_set[slot]); + + mutex_enter(&lldev->genlock); + if (!ring->mmac.mac_addr_set[slot]) { + mutex_exit(&lldev->genlock); + /* + * The result will be unexpected when reach here. WARNING! + */ + xge_debug_ll(XGE_ERR, + "%s%d: caller is trying to remove an unset MAC address", + XGELL_IFNAME, lldev->instance); + return (ENXIO); + } + + status = xge_hal_device_macaddr_clear(hldev, slot_start + slot); + if (status != XGE_HAL_OK) { + mutex_exit(&lldev->genlock); + return (EIO); + } + + ring->mmac.mac_addr_set[slot] = B_FALSE; + ring->mmac.naddrfree++; + + /* + * TODO: Disable MAC RTS if all addresses have been cleared. + */ + + /* + * Read back the MAC address from HAL to keep the array up to date. + */ + (void) xge_hal_device_macaddr_get(hldev, slot_start + slot, + ring->mmac.mac_addr + slot); + mutex_exit(&lldev->genlock); + + return (0); } /* - * xge_m_tx - * @arg: pointer to the xgelldev_t structure - * @resid: resource id - * @mp: pointer to the message buffer + * Temporarily calling hal function. * - * Called by MAC Layer to send a chain of packets + * With MSI-X implementation, no lock is needed, so that the interrupt + * handling could be faster. 
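+ * Note the inversion: "enabling" the ring interrupt switches channel + * polling off, and "disabling" it switches polling on, after which the + * MAC layer pulls packets itself through xgell_rx_poll().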
*/ -static mblk_t * -xgell_m_tx(void *arg, mblk_t *mp) +int +xgell_rx_ring_intr_enable(mac_intr_handle_t ih) { - xgelldev_t *lldev = arg; - mblk_t *next; + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)ih; - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; + mutex_enter(&ring->ring_lock); + xge_hal_device_rx_channel_disable_polling(ring->channelh); + mutex_exit(&ring->ring_lock); - if (!xgell_send(lldev, mp)) { - mp->b_next = next; - break; - } - mp = next; + return (0); +} + +int +xgell_rx_ring_intr_disable(mac_intr_handle_t ih) +{ + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)ih; + + mutex_enter(&ring->ring_lock); + xge_hal_device_rx_channel_enable_polling(ring->channelh); + mutex_exit(&ring->ring_lock); + + return (0); +} + +static int +xgell_rx_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +{ + xgell_rx_ring_t *rx_ring = (xgell_rx_ring_t *)rh; + + rx_ring->ring_gen_num = mr_gen_num; + + return (0); +} + +/*ARGSUSED*/ +static void +xgell_rx_ring_stop(mac_ring_driver_t rh) +{ +} + +/*ARGSUSED*/ +static int +xgell_tx_ring_start(mac_ring_driver_t rh, uint64_t useless) +{ + return (0); +} + +/*ARGSUSED*/ +static void +xgell_tx_ring_stop(mac_ring_driver_t rh) +{ +} + +/* + * Callback funtion for MAC layer to register all rings. + * + * Xframe hardware doesn't support grouping explicitly, so the driver needs + * to pretend having resource groups. We may also optionally group all 8 rx + * rings into a single group for increased scalability on CMT architectures, + * or group one rx ring per group for maximum virtualization. + * + * TX grouping is actually done by framework, so, just register all TX + * resources without grouping them. + */ +void +xgell_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + xgelldev_t *lldev = (xgelldev_t *)arg; + mac_intr_t *mintr; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + xgell_rx_ring_t *rx_ring; + + xge_assert(index < lldev->init_rx_rings); + xge_assert(rg_index < lldev->init_rx_groups); + + /* + * Performance vs. 
Virtualization + */ + if (lldev->init_rx_rings == lldev->init_rx_groups) + rx_ring = lldev->rx_ring + rg_index; + else + rx_ring = lldev->rx_ring + index; + + rx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = xgell_rx_ring_start; + infop->mri_stop = xgell_rx_ring_stop; + infop->mri_poll = xgell_rx_poll; + + mintr = &infop->mri_intr; + mintr->mi_handle = (mac_intr_handle_t)rx_ring; + mintr->mi_enable = xgell_rx_ring_intr_enable; + mintr->mi_disable = xgell_rx_ring_intr_disable; + + break; } + case MAC_RING_TYPE_TX: { + xgell_tx_ring_t *tx_ring; - return (mp); + xge_assert(rg_index == -1); + + xge_assert((index >= 0) && (index < lldev->init_tx_rings)); + + tx_ring = lldev->tx_ring + index; + tx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)tx_ring; + infop->mri_start = xgell_tx_ring_start; + infop->mri_stop = xgell_tx_ring_stop; + infop->mri_tx = xgell_ring_tx; + + break; + } + default: + break; + } +} + +void +xgell_fill_group(void *arg, mac_ring_type_t rtype, const int index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + xgelldev_t *lldev = (xgelldev_t *)arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + xgell_rx_ring_t *rx_ring; + + xge_assert(index < lldev->init_rx_groups); + + rx_ring = lldev->rx_ring + index; + + rx_ring->group_handle = gh; + + infop->mgi_driver = (mac_group_driver_t)rx_ring; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = xgell_addmac; + infop->mgi_remmac = xgell_remmac; + infop->mgi_count = lldev->init_rx_rings / lldev->init_rx_groups; + + break; + } + case MAC_RING_TYPE_TX: + xge_assert(0); + break; + default: + break; + } +} + +/* + * xgell_macaddr_set + */ +static int +xgell_maddr_set(xgelldev_t *lldev, int index, uint8_t *macaddr) +{ + xge_hal_device_t *hldev = lldev->devh; + xge_hal_status_e status; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_maddr_set"); + + xge_debug_ll(XGE_TRACE, + "setting macaddr: 0x%02x-%02x-%02x-%02x-%02x-%02x", + macaddr[0], macaddr[1], macaddr[2], + macaddr[3], macaddr[4], macaddr[5]); + + status = xge_hal_device_macaddr_set(hldev, index, (uchar_t *)macaddr); + + if (status != XGE_HAL_OK) { + xge_debug_ll(XGE_ERR, "%s%d: can not set mac address", + XGELL_IFNAME, lldev->instance); + return (EIO); + } + + return (0); } /* @@ -1201,12 +1558,13 @@ static void xgell_rx_dtr_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh, xge_hal_dtr_state_e state, void *userdata, xge_hal_channel_reopen_e reopen) { - xgell_ring_t *ring = (xgell_ring_t *)userdata; xgell_rxd_priv_t *rxd_priv = ((xgell_rxd_priv_t *)xge_hal_ring_dtr_private(channelh, dtrh)); xgell_rx_buffer_t *rx_buffer = rxd_priv->rx_buffer; if (state == XGE_HAL_DTR_STATE_POSTED) { + xgell_rx_ring_t *ring = rx_buffer->ring; + mutex_enter(&ring->bf_pool.pool_lock); xge_hal_ring_dtr_free(channelh, dtrh); xgell_rx_buffer_release(rx_buffer); @@ -1215,6 +1573,137 @@ xgell_rx_dtr_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh, } /* + * To open a rx ring. 
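+ * This creates the ring's buffer pool, opens the HAL channel and + * initializes the per-ring MAC address window before marking the + * ring live.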
+ */ +static boolean_t +xgell_rx_ring_open(xgell_rx_ring_t *rx_ring) +{ + xge_hal_status_e status; + xge_hal_channel_attr_t attr; + xgelldev_t *lldev = rx_ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + + if (rx_ring->live) + return (B_TRUE); + + /* Create the buffer pool first */ + if (!xgell_rx_create_buffer_pool(rx_ring)) { + xge_debug_ll(XGE_ERR, "can not create buffer pool for ring: %d", + rx_ring->index); + return (B_FALSE); + } + + /* Default ring initialization */ + attr.post_qid = rx_ring->index; + attr.compl_qid = 0; + attr.callback = xgell_rx_1b_callback; + attr.per_dtr_space = sizeof (xgell_rxd_priv_t); + attr.flags = 0; + attr.type = XGE_HAL_CHANNEL_TYPE_RING; + attr.dtr_init = xgell_rx_dtr_replenish; + attr.dtr_term = xgell_rx_dtr_term; + attr.userdata = rx_ring; + + status = xge_hal_channel_open(lldev->devh, &attr, &rx_ring->channelh, + XGE_HAL_CHANNEL_OC_NORMAL); + if (status != XGE_HAL_OK) { + xge_debug_ll(XGE_ERR, "%s%d: cannot open Rx channel got status " + " code %d", XGELL_IFNAME, lldev->instance, status); + (void) xgell_rx_destroy_buffer_pool(rx_ring); + return (B_FALSE); + } + + xgell_rx_ring_maddr_init(rx_ring); + + mutex_init(&rx_ring->ring_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(hldev->irqh)); + + rx_ring->received_bytes = 0; + rx_ring->poll_bytes = -1; + rx_ring->polled_bytes = 0; + rx_ring->poll_mp = NULL; + rx_ring->live = B_TRUE; + + xge_debug_ll(XGE_TRACE, "RX ring [%d] is opened successfully", + rx_ring->index); + + return (B_TRUE); +} + +static void +xgell_rx_ring_close(xgell_rx_ring_t *rx_ring) +{ + if (!rx_ring->live) + return; + xge_hal_channel_close(rx_ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL); + rx_ring->channelh = NULL; + /* This may not clean up all used buffers, driver will handle it */ + if (xgell_rx_destroy_buffer_pool(rx_ring)) + rx_ring->live = B_FALSE; + + mutex_destroy(&rx_ring->ring_lock); +} + +/* + * xgell_rx_open + * @lldev: the link layer object + * + * Initialize and open all RX channels. 
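+ * Rings are opened in index order; on failure the caller unwinds + * through xgell_rx_close(), which skips rings that never went live.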
+ */ +static boolean_t +xgell_rx_open(xgelldev_t *lldev) +{ + xgell_rx_ring_t *rx_ring; + int i; + + if (lldev->live_rx_rings != 0) + return (B_TRUE); + + lldev->live_rx_rings = 0; + + /* + * Initialize all rings + */ + for (i = 0; i < lldev->init_rx_rings; i++) { + rx_ring = &lldev->rx_ring[i]; + rx_ring->index = i; + rx_ring->lldev = lldev; + rx_ring->live = B_FALSE; + + if (!xgell_rx_ring_open(rx_ring)) + return (B_FALSE); + + lldev->live_rx_rings++; + } + + return (B_TRUE); +} + +static void +xgell_rx_close(xgelldev_t *lldev) +{ + xgell_rx_ring_t *rx_ring; + int i; + + if (lldev->live_rx_rings == 0) + return; + + /* + * Close all rx rings + */ + for (i = 0; i < lldev->init_rx_rings; i++) { + rx_ring = &lldev->rx_ring[i]; + + if (rx_ring->live) { + xgell_rx_ring_close(rx_ring); + lldev->live_rx_rings--; + } + } + + xge_assert(lldev->live_rx_rings == 0); +} + +/* * xgell_tx_term * * Function will be called by HAL to terminate all DTRs for @@ -1252,215 +1741,105 @@ xgell_tx_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh, } } -/* - * xgell_tx_close - * @lldev: the link layer object - * - * Close all Tx channels - */ -static void -xgell_tx_close(xgelldev_t *lldev) -{ - xge_list_t *item, *list; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; - - list = &hldev->fifo_channels; - while (!xge_list_is_empty(list)) { - item = xge_list_first_get(list); - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - - xge_hal_channel_close(channel, XGE_HAL_CHANNEL_OC_NORMAL); - } -} - -/* - * xgell_tx_open - * @lldev: the link layer object - * - * Initialize and open all Tx channels; - */ static boolean_t -xgell_tx_open(xgelldev_t *lldev) +xgell_tx_ring_open(xgell_tx_ring_t *tx_ring) { xge_hal_status_e status; - u64 adapter_status; xge_hal_channel_attr_t attr; - xge_list_t *item; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; + xgelldev_t *lldev = tx_ring->lldev; + + if (tx_ring->live) + return (B_TRUE); - attr.post_qid = 0; + attr.post_qid = tx_ring->index; attr.compl_qid = 0; attr.callback = xgell_xmit_compl; attr.per_dtr_space = sizeof (xgell_txd_priv_t); attr.flags = 0; attr.type = XGE_HAL_CHANNEL_TYPE_FIFO; - attr.userdata = lldev; attr.dtr_init = NULL; attr.dtr_term = xgell_tx_term; + attr.userdata = tx_ring; - if (xge_hal_device_status(lldev->devh, &adapter_status)) { - xge_debug_ll(XGE_ERR, "%s%d: device is not ready " - "adaper status reads 0x%"PRIx64, XGELL_IFNAME, - lldev->instance, (uint64_t)adapter_status); + status = xge_hal_channel_open(lldev->devh, &attr, &tx_ring->channelh, + XGE_HAL_CHANNEL_OC_NORMAL); + if (status != XGE_HAL_OK) { + xge_debug_ll(XGE_ERR, "%s%d: cannot open Tx channel got status " + "code %d", XGELL_IFNAME, lldev->instance, status); return (B_FALSE); } - /* - * Open only configured channels. HAL structures are static, - * so, no worries here.. 
- */ -_next_channel: - xge_list_for_each(item, &hldev->free_channels) { - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - xgell_fifo_t *fifo; - - /* filter on FIFO channels */ - if (channel->type != XGE_HAL_CHANNEL_TYPE_FIFO) - continue; - - fifo = &lldev->fifos[attr.post_qid]; - fifo->lldev = lldev; - attr.userdata = fifo; - - status = xge_hal_channel_open(lldev->devh, &attr, - &fifo->channelh, XGE_HAL_CHANNEL_OC_NORMAL); - if (status != XGE_HAL_OK) { - xge_debug_ll(XGE_ERR, "%s%d: cannot open Tx channel " - "got status code %d", XGELL_IFNAME, - lldev->instance, status); - /* unwind */ - xgell_tx_close(lldev); - return (B_FALSE); - } - - attr.post_qid++; - - /* - * because channel_open() moves xge_list entry - * to the fifos_channels - */ - goto _next_channel; - } + tx_ring->sent_bytes = 0; + tx_ring->live = B_TRUE; return (B_TRUE); } -/* - * xgell_rx_close - * @lldev: the link layer object - * - * Close all Rx channels - */ static void -xgell_rx_close(xgelldev_t *lldev) +xgell_tx_ring_close(xgell_tx_ring_t *tx_ring) { - xge_list_t *item, *list; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; - - list = &hldev->ring_channels; - while (!xge_list_is_empty(list)) { - item = xge_list_first_get(list); - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - xgell_ring_t *ring = xge_hal_channel_userdata(channel); - - xge_hal_channel_close(channel, XGE_HAL_CHANNEL_OC_NORMAL); - - /* - * destroy Ring's buffer pool - */ - if (xgell_rx_destroy_buffer_pool(ring) != DDI_SUCCESS) { - xge_debug_ll(XGE_ERR, "unable to destroy Ring%d " - "buffer pool", channel->post_qid); - } - list = &hldev->ring_channels; - } + if (!tx_ring->live) + return; + xge_hal_channel_close(tx_ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL); + tx_ring->live = B_FALSE; } /* - * xgell_rx_open + * xgell_tx_open * @lldev: the link layer object * - * Initialize and open all Rx channels; + * Initialize and open all TX channels. */ static boolean_t -xgell_rx_open(xgelldev_t *lldev) +xgell_tx_open(xgelldev_t *lldev) { - xge_hal_status_e status; - u64 adapter_status; - xge_hal_channel_attr_t attr; - xge_list_t *item; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; + xgell_tx_ring_t *tx_ring; + int i; - attr.post_qid = 0; - attr.compl_qid = 0; - attr.callback = xgell_rx_1b_compl; - attr.per_dtr_space = sizeof (xgell_rxd_priv_t); - attr.flags = 0; - attr.type = XGE_HAL_CHANNEL_TYPE_RING; - attr.dtr_init = xgell_rx_dtr_replenish; - attr.dtr_term = xgell_rx_dtr_term; + if (lldev->live_tx_rings != 0) + return (B_TRUE); - if (xge_hal_device_status(lldev->devh, &adapter_status)) { - xge_debug_ll(XGE_ERR, - "%s%d: device is not ready adaper status reads 0x%"PRIx64, - XGELL_IFNAME, lldev->instance, - (uint64_t)adapter_status); - return (B_FALSE); - } + lldev->live_tx_rings = 0; /* - * Open only configured channels. HAL structures are static, - * so, no worries here.. + * Enable rings by reserve sequence to match the h/w sequences. 
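+ * (That is, the rings are opened in increasing index order, and each
+ * software ring is bound to the hardware FIFO with the same number,
+ * since xgell_tx_ring_open() above sets attr.post_qid = tx_ring->index.)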
*/ -_next_channel: - xge_list_for_each(item, &hldev->free_channels) { - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - xgell_ring_t *ring; - - /* filter on RING channels */ - if (channel->type != XGE_HAL_CHANNEL_TYPE_RING) - continue; - - ring = &lldev->rings[attr.post_qid]; - ring->lldev = lldev; - attr.userdata = ring; - - if (xgell_rx_create_buffer_pool(ring) != DDI_SUCCESS) { - xge_debug_ll(XGE_ERR, "unable to create Ring%d " - "buffer pool", attr.post_qid); - /* unwind */ - xgell_rx_close(lldev); - return (B_FALSE); - } + for (i = 0; i < lldev->init_tx_rings; i++) { + tx_ring = &lldev->tx_ring[i]; + tx_ring->index = i; + tx_ring->lldev = lldev; + tx_ring->live = B_FALSE; - status = xge_hal_channel_open(lldev->devh, &attr, - &ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL); - if (status != XGE_HAL_OK) { - xge_debug_ll(XGE_ERR, "%s%d: cannot open Rx channel " - "got status got status code %d", XGELL_IFNAME, - lldev->instance, status); - /* unwind */ - (void) xgell_rx_destroy_buffer_pool(ring); - xgell_rx_close(lldev); + if (!xgell_tx_ring_open(tx_ring)) return (B_FALSE); - } - attr.post_qid++; - - /* - * because chhannel_open() moves xge_list entry - * to the rings channels - */ - goto _next_channel; + lldev->live_tx_rings++; } return (B_TRUE); } +static void +xgell_tx_close(xgelldev_t *lldev) +{ + xgell_tx_ring_t *tx_ring; + int i; + + if (lldev->live_tx_rings == 0) + return; + + /* + * Enable rings by reserve sequence to match the h/w sequences. + */ + for (i = 0; i < lldev->init_tx_rings; i++) { + tx_ring = &lldev->tx_ring[i]; + if (tx_ring->live) { + xgell_tx_ring_close(tx_ring); + lldev->live_tx_rings--; + } + } +} + static int xgell_initiate_start(xgelldev_t *lldev) { @@ -1485,13 +1864,13 @@ xgell_initiate_start(xgelldev_t *lldev) } /* tune jumbo/normal frame UFC counters */ - hldev->config.ring.queue[XGELL_RING_MAIN_QID].rti.ufc_b = \ - maxpkt > XGE_HAL_DEFAULT_MTU ? + hldev->config.ring.queue[XGELL_RX_RING_MAIN].rti.ufc_b = + (maxpkt > XGE_HAL_DEFAULT_MTU) ? XGE_HAL_DEFAULT_RX_UFC_B_J : XGE_HAL_DEFAULT_RX_UFC_B_N; - hldev->config.ring.queue[XGELL_RING_MAIN_QID].rti.ufc_c = \ - maxpkt > XGE_HAL_DEFAULT_MTU ? + hldev->config.ring.queue[XGELL_RX_RING_MAIN].rti.ufc_c = + (maxpkt > XGE_HAL_DEFAULT_MTU) ? XGE_HAL_DEFAULT_RX_UFC_C_J : XGE_HAL_DEFAULT_RX_UFC_C_N; @@ -1515,6 +1894,7 @@ xgell_initiate_start(xgelldev_t *lldev) XGELL_IFNAME, lldev->instance, (uint64_t)adapter_status, status); } + xgell_rx_close(lldev); xge_os_mdelay(1500); return (ENOMEM); } @@ -1531,9 +1911,9 @@ xgell_initiate_start(xgelldev_t *lldev) XGELL_IFNAME, lldev->instance, (uint64_t)adapter_status, status); } - xge_os_mdelay(1500); + xgell_tx_close(lldev); xgell_rx_close(lldev); - + xge_os_mdelay(1500); return (ENOMEM); } @@ -1686,46 +2066,6 @@ xgell_onerr_reset(xgelldev_t *lldev) return (rc); } - -/* - * xgell_m_unicst - * @arg: pointer to device private strucutre(hldev) - * @mac_addr: - * - * This function is called by MAC Layer to set the physical address - * of the XFRAME firmware. 
- */ -static int -xgell_m_unicst(void *arg, const uint8_t *macaddr) -{ - xge_hal_status_e status; - xgelldev_t *lldev = (xgelldev_t *)arg; - xge_hal_device_t *hldev = lldev->devh; - xge_debug_ll(XGE_TRACE, "%s", "MAC_UNICST"); - - xge_debug_ll(XGE_TRACE, "%s", "M_UNICAST"); - - mutex_enter(&lldev->genlock); - - xge_debug_ll(XGE_TRACE, - "setting macaddr: 0x%02x-%02x-%02x-%02x-%02x-%02x", - macaddr[0], macaddr[1], macaddr[2], - macaddr[3], macaddr[4], macaddr[5]); - - status = xge_hal_device_macaddr_set(hldev, 0, (uchar_t *)macaddr); - if (status != XGE_HAL_OK) { - xge_debug_ll(XGE_ERR, "%s%d: can not set mac address", - XGELL_IFNAME, lldev->instance); - mutex_exit(&lldev->genlock); - return (EIO); - } - - mutex_exit(&lldev->genlock); - - return (0); -} - - /* * xgell_m_multicst * @arg: pointer to device private strucutre(hldev) @@ -2039,12 +2379,14 @@ xgell_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } } -/* ARGSUSED */ + static boolean_t xgell_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { xgelldev_t *lldev = arg; + xge_debug_ll(XGE_TRACE, "xgell_m_getcapab: %x", cap); + switch (cap) { case MAC_CAPAB_HCKSUM: { uint32_t *hcksum_txflags = cap_data; @@ -2063,6 +2405,29 @@ xgell_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); } } + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_RX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = lldev->init_rx_rings; + cap_rings->mr_gnum = lldev->init_rx_groups; + cap_rings->mr_rget = xgell_fill_ring; + cap_rings->mr_gget = xgell_fill_group; + break; + case MAC_RING_TYPE_TX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = lldev->init_tx_rings; + cap_rings->mr_gnum = 0; + cap_rings->mr_rget = xgell_fill_ring; + cap_rings->mr_gget = NULL; + break; + default: + break; + } + break; + } default: return (B_FALSE); } @@ -2320,8 +2685,7 @@ xgell_devconfig_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp) return (ENOSPC); } status = xge_hal_aux_device_config_read(lldev->devh, - XGELL_DEVCONF_BUFSIZE, - buf, &retsize); + XGELL_DEVCONF_BUFSIZE, buf, &retsize); if (status != XGE_HAL_OK) { kmem_free(buf, XGELL_DEVCONF_BUFSIZE); xge_debug_ll(XGE_ERR, "device_config_read(): status %d", @@ -2349,6 +2713,9 @@ xgell_device_register(xgelldev_t *lldev, xgell_config_t *config) mac_register_t *macp = NULL; xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; + /* + * Initialize some NDD interface for internal debug. + */ if (nd_load(&lldev->ndp, "pciconf", xgell_pciconf_get, NULL, (caddr_t)lldev) == B_FALSE) goto xgell_ndd_fail; @@ -2393,11 +2760,11 @@ xgell_device_register(xgelldev_t *lldev, xgell_config_t *config) macp->m_min_sdu = 0; macp->m_max_sdu = hldev->config.mtu; macp->m_margin = VLAN_TAGSZ; + macp->m_v12n = MAC_VIRT_LEVEL1; + /* - * Finally, we're ready to register ourselves with the Nemo - * interface; if this succeeds, we're all ready to start() + * MAC Registration. 
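+ * With m_v12n set to MAC_VIRT_LEVEL1 above, a successful mac_register()
+ * lets the MAC layer query MAC_CAPAB_RINGS and drive the rings and
+ * groups through the xgell_fill_ring()/xgell_fill_group() callbacks.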
*/ - if (mac_register(macp, &lldev->mh) != 0) goto xgell_register_fail; diff --git a/usr/src/uts/common/io/xge/drv/xgell.h b/usr/src/uts/common/io/xge/drv/xgell.h index aa8bcc43ff..93845bb655 100644 --- a/usr/src/uts/common/io/xge/drv/xgell.h +++ b/usr/src/uts/common/io/xge/drv/xgell.h @@ -60,7 +60,7 @@ #include <sys/pattr.h> #include <sys/strsun.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #ifdef __cplusplus @@ -69,11 +69,6 @@ extern "C" { #define XGELL_DESC "Xframe I/II 10Gb Ethernet" #define XGELL_IFNAME "xge" -#define XGELL_TX_LEVEL_LOW 8 -#define XGELL_TX_LEVEL_HIGH 32 -#define XGELL_TX_LEVEL_CHECK 3 -#define XGELL_MAX_RING_DEFAULT 8 -#define XGELL_MAX_FIFO_DEFAULT 1 #include <xgehal.h> @@ -93,25 +88,64 @@ extern "C" { #define XGELL_RX_BUFFER_TOTAL XGE_HAL_RING_RXDS_PER_BLOCK(1) * 6 #define XGELL_RX_BUFFER_POST_HIWAT XGE_HAL_RING_RXDS_PER_BLOCK(1) * 5 -/* Control driver to copy or DMA received packets */ -#define XGELL_RX_DMA_LOWAT 256 +/* + * Multiple rings configuration + */ +#define XGELL_RX_RING_MAIN 0 +#define XGELL_TX_RING_MAIN 0 + +#define XGELL_RX_RING_NUM_MIN 1 +#define XGELL_TX_RING_NUM_MIN 1 +#define XGELL_RX_RING_NUM_MAX 8 +#define XGELL_TX_RING_NUM_MAX 1 /* TODO */ +#define XGELL_RX_RING_NUM_DEFAULT XGELL_RX_RING_NUM_MAX +#define XGELL_TX_RING_NUM_DEFAULT XGELL_TX_RING_NUM_MAX + +#define XGELL_MINTR_NUM_MIN 1 +#define XGELL_MINTR_NUM_MAX \ + (XGELL_RX_RING_NUM_MAX + XGELL_TX_RING_NUM_MAX + 1) +#define XGELL_MINTR_NUM_DEFAULT XGELL_MINTR_NUM_MAX + +#define XGELL_CONF_GROUP_POLICY_BASIC 0 +#define XGELL_CONF_GROUP_POLICY_VIRT 1 +#define XGELL_CONF_GROUP_POLICY_PERF 2 +#if 0 +#if defined(__sparc) +#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_PERF +#else +#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_VIRT +#endif +#else +/* + * The _PERF configuration enable a fat group of all rx rings, as approachs + * better fanout performance of the primary interface. + */ +#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_PERF +#endif -#define XGELL_RING_MAIN_QID 0 +#define XGELL_TX_LEVEL_LOW 8 +#define XGELL_TX_LEVEL_HIGH 32 +#define XGELL_TX_LEVEL_CHECK 3 +#define XGELL_MAX_RING_DEFAULT 8 +#define XGELL_MAX_FIFO_DEFAULT 1 -#if defined(__x86) -#define XGELL_TX_DMA_LOWAT 128 +/* Control driver to copy or DMA inbound/outbound packets */ +#if defined(__sparc) +#define XGELL_RX_DMA_LOWAT 256 +#define XGELL_TX_DMA_LOWAT 512 #else -#define XGELL_TX_DMA_LOWAT 512 +#define XGELL_RX_DMA_LOWAT 256 +#define XGELL_TX_DMA_LOWAT 128 #endif /* * Try to collapse up to XGELL_RX_PKT_BURST packets into single mblk * sequence before mac_rx() is called. 
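 * (At the default XGELL_RX_PKT_BURST of 32, a full burst of 1500-byte
 * frames amounts to roughly 48KB handed to mac_rx() in one call.)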
*/ -#define XGELL_RX_PKT_BURST 32 +#define XGELL_RX_PKT_BURST 32 /* About 1s */ -#define XGE_DEV_POLL_TICKS drv_usectohz(1000000) +#define XGE_DEV_POLL_TICKS drv_usectohz(1000000) #define XGELL_LSO_MAXLEN 65535 #define XGELL_CONF_ENABLE_BY_DEFAULT 1 @@ -157,6 +191,7 @@ extern "C" { #define XGE_HAL_DEFAULT_RX_TIMER_AC_EN 1 #define XGE_HAL_DEFAULT_RX_TIMER_VAL 384 +#define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_A 1024 #define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_J 2048 #define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_N 4096 #define XGE_HAL_DEFAULT_FIFO_QUEUE_INTR 0 @@ -171,15 +206,14 @@ extern "C" { */ #define XGE_HAL_DEFAULT_FIFO_ALIGNMENT_SIZE 4096 #define XGE_HAL_DEFAULT_FIFO_MAX_ALIGNED_FRAGS 1 -#if defined(__x86) -#define XGE_HAL_DEFAULT_FIFO_FRAGS 128 -#else +#if defined(__sparc) #define XGE_HAL_DEFAULT_FIFO_FRAGS 64 +#else +#define XGE_HAL_DEFAULT_FIFO_FRAGS 128 #endif #define XGE_HAL_DEFAULT_FIFO_FRAGS_THRESHOLD 18 -#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_J 2 -#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_N 2 +#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS 2 #define XGE_HAL_RING_QUEUE_BUFFER_MODE_DEFAULT 1 #define XGE_HAL_DEFAULT_BACKOFF_INTERVAL_US 64 #define XGE_HAL_DEFAULT_RING_PRIORITY 0 @@ -202,18 +236,15 @@ extern "C" { #define XGE_HAL_DEFAULT_STATS_REFRESH_TIME 1 #if defined(__sparc) -#define XGE_HAL_DEFAULT_MMRB_COUNT \ - XGE_HAL_MAX_MMRB_COUNT -#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION \ - XGE_HAL_EIGHT_SPLIT_TRANSACTION +#define XGE_HAL_DEFAULT_MMRB_COUNT XGE_HAL_MAX_MMRB_COUNT +#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION XGE_HAL_EIGHT_SPLIT_TRANSACTION #else #define XGE_HAL_DEFAULT_MMRB_COUNT 1 /* 1k */ -#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION \ - XGE_HAL_TWO_SPLIT_TRANSACTION +#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION XGE_HAL_TWO_SPLIT_TRANSACTION #endif /* - * default the size of buffers allocated for ndd interface functions + * Default the size of buffers allocated for ndd interface functions */ #define XGELL_STATS_BUFSIZE 8192 #define XGELL_PCICONF_BUFSIZE 2048 @@ -222,17 +253,12 @@ extern "C" { #define XGELL_DEVCONF_BUFSIZE 8192 /* - * xgell_event_e + * Multiple mac address definitions * - * This enumeration derived from xgehal_event_e. It extends it - * for the reason to get serialized context. + * We'll use whole MAC Addresses Configuration Memory for unicast addresses, + * since current multicast implementation in HAL is by enabling promise mode. 
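+ *
+ * As a rough bound: at 8 addresses per ring group, even one group per RX
+ * ring (8 rings maximum) consumes only 64 unicast slots, well within the
+ * 256 entries XGE_HAL_MAX_MAC_ADDRESSES grows to elsewhere in this patch.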
*/ -/* Renamb the macro from HAL */ -#define XGELL_EVENT_BASE XGE_LL_EVENT_BASE -typedef enum xgell_event_e { - /* LL events */ - XGELL_EVENT_RESCHED_NEEDED = XGELL_EVENT_BASE + 1, -} xgell_event_e; +#define XGE_RX_MULTI_MAC_ADDRESSES_MAX 8 /* per ring group */ typedef struct { int rx_pkt_burst; @@ -240,24 +266,27 @@ typedef struct { int rx_buffer_post_hiwat; int rx_dma_lowat; int tx_dma_lowat; - int msix_enable; int lso_enable; + int msix_enable; + int grouping; } xgell_config_t; -typedef struct xgell_ring xgell_ring_t; -typedef struct xgell_fifo xgell_fifo_t; +typedef struct xgell_multi_mac xgell_multi_mac_t; +typedef struct xgell_rx_ring xgell_rx_ring_t; +typedef struct xgell_tx_ring xgell_tx_ring_t; +typedef struct xgelldev xgelldev_t; typedef struct xgell_rx_buffer_t { - struct xgell_rx_buffer_t *next; - void *vaddr; - dma_addr_t dma_addr; - ddi_dma_handle_t dma_handle; - ddi_acc_handle_t dma_acch; - xgell_ring_t *ring; - frtn_t frtn; + struct xgell_rx_buffer_t *next; + void *vaddr; + dma_addr_t dma_addr; + ddi_dma_handle_t dma_handle; + ddi_acc_handle_t dma_acch; + xgell_rx_ring_t *ring; + frtn_t frtn; } xgell_rx_buffer_t; -/* Buffer pool for all rings */ +/* Buffer pool for one rx ring */ typedef struct xgell_rx_buffer_pool_t { uint_t total; /* total buffers */ uint_t size; /* buffer size */ @@ -266,50 +295,92 @@ typedef struct xgell_rx_buffer_pool_t { uint_t post; /* posted buffers */ uint_t post_hiwat; /* hiwat to stop post */ spinlock_t pool_lock; /* buffer pool lock */ + boolean_t live; /* pool status */ xgell_rx_buffer_t *recycle_head; /* recycle list's head */ xgell_rx_buffer_t *recycle_tail; /* recycle list's tail */ uint_t recycle; /* # of rx buffers recycled */ spinlock_t recycle_lock; /* buffer recycle lock */ } xgell_rx_buffer_pool_t; -typedef struct xgelldev xgelldev_t; +struct xgell_multi_mac { + int naddr; /* total supported addresses */ + int naddrfree; /* free addresses slots */ + ether_addr_t mac_addr[XGE_RX_MULTI_MAC_ADDRESSES_MAX]; + boolean_t mac_addr_set[XGE_RX_MULTI_MAC_ADDRESSES_MAX]; +}; -struct xgell_ring { - xge_hal_channel_h channelh; - xgelldev_t *lldev; - mac_resource_handle_t handle; /* per ring cookie */ - xgell_rx_buffer_pool_t bf_pool; +typedef uint_t (*intr_func_t)(caddr_t, caddr_t); + +typedef struct xgell_intr { + uint_t index; + ddi_intr_handle_t *handle; /* DDI interrupt handle */ + intr_func_t *function; /* interrupt function */ + caddr_t arg; /* interrupt source */ +} xgell_intr_t; + +struct xgell_rx_ring { + int index; + boolean_t live; /* ring active status */ + xge_hal_channel_h channelh; /* hardware channel */ + xgelldev_t *lldev; /* driver device */ + mac_ring_handle_t ring_handle; /* call back ring handle */ + mac_group_handle_t group_handle; /* call back group handle */ + uint64_t ring_gen_num; + + xgell_multi_mac_t mmac; /* per group multiple addrs */ + xgell_rx_buffer_pool_t bf_pool; /* per ring buffer pool */ + int received_bytes; /* total received bytes */ + int intr_bytes; /* interrupt received bytes */ + int poll_bytes; /* bytes to be polled up */ + int polled_bytes; /* total polled bytes */ + mblk_t *poll_mp; /* polled messages */ + + spinlock_t ring_lock; /* per ring lock */ }; -struct xgell_fifo { - xge_hal_channel_h channelh; - xgelldev_t *lldev; - int level_low; +struct xgell_tx_ring { + int index; + boolean_t live; /* ring active status */ + xge_hal_channel_h channelh; /* hardware channel */ + xgelldev_t *lldev; /* driver device */ + mac_ring_handle_t ring_handle; /* call back ring handle */ + int sent_bytes; /* bytes sent 
though the ring */ + + boolean_t need_resched; }; struct xgelldev { - caddr_t ndp; + volatile int is_initialized; + volatile int in_reset; + kmutex_t genlock; mac_handle_t mh; int instance; dev_info_t *dev_info; xge_hal_device_h devh; - xgell_ring_t rings[XGE_HAL_MAX_RING_NUM]; - xgell_fifo_t fifos[XGE_HAL_MAX_FIFO_NUM]; - int resched_avail; - int resched_send; - int resched_retry; - int tx_copied_max; - volatile int is_initialized; - xgell_config_t config; - volatile int in_reset; + caddr_t ndp; timeout_id_t timeout_id; - kmutex_t genlock; + + int init_rx_rings; + int init_tx_rings; + int init_rx_groups; + + int live_rx_rings; + int live_tx_rings; + xgell_rx_ring_t rx_ring[XGELL_RX_RING_NUM_DEFAULT]; + xgell_tx_ring_t tx_ring[XGELL_TX_RING_NUM_DEFAULT]; + + int tx_copied_max; + + xgell_intr_t intrs[XGELL_MINTR_NUM_DEFAULT]; + ddi_intr_handle_t *intr_table; uint_t intr_table_size; int intr_type; int intr_cnt; uint_t intr_pri; int intr_cap; + + xgell_config_t config; }; typedef struct { diff --git a/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h b/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h index 5852bb9e9a..5275da409a 100644 --- a/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h +++ b/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h @@ -21,6 +21,11 @@ * Copyright (c) 2002-2006 Neterion, Inc. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + #ifndef XGE_HAL_CHANNEL_H #define XGE_HAL_CHANNEL_H @@ -69,7 +74,8 @@ typedef enum xge_hal_channel_type_e { typedef enum xge_hal_channel_flag_e { XGE_HAL_CHANNEL_FLAG_NONE = 0x0, XGE_HAL_CHANNEL_FLAG_USE_TX_LOCK = 0x1, - XGE_HAL_CHANNEL_FLAG_FREE_RXD = 0x2 + XGE_HAL_CHANNEL_FLAG_FREE_RXD = 0x2, + XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING = 0x4 } xge_hal_channel_flag_e; /** diff --git a/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h b/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h index e79774e329..f0b0a3520d 100644 --- a/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h +++ b/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h @@ -21,6 +21,11 @@ * Copyright (c) 2002-2006 Neterion, Inc. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + #ifndef XGE_HAL_REGS_H #define XGE_HAL_REGS_H @@ -814,8 +819,8 @@ typedef struct { u64 rmac_cfg_key; #define XGE_HAL_RMAC_CFG_KEY(val) vBIT(val,0,16) -#define XGE_HAL_MAX_MAC_ADDRESSES 64 -#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET 63 +#define XGE_HAL_MAX_MAC_ADDRESSES 256 +#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET 255 #define XGE_HAL_MAX_MAC_ADDRESSES_HERC 256 #define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET_HERC 255 diff --git a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c index 5b70ea1378..d08c1d58bf 100644 --- a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c +++ b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c @@ -21,6 +21,11 @@ * Copyright (c) 2002-2006 Neterion, Inc. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
 #ifdef XGE_DEBUG_FP
 #include "xgehal-device.h"
 #endif
@@ -444,7 +449,9 @@ xge_hal_device_poll_rx_channels(xge_hal_device_t *hldev, int *got_rx)
 		if (hldev->terminating)
 			return XGE_HAL_OK;
 		channel = xge_container_of(item, xge_hal_channel_t, item);
-		(void) xge_hal_device_poll_rx_channel(channel, got_rx);
+		if (!(channel->flags & XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING)) {
+			(void) xge_hal_device_poll_rx_channel(channel, got_rx);
+		}
 	}
 
 	return XGE_HAL_OK;
@@ -483,6 +490,21 @@ xge_hal_device_poll_tx_channels(xge_hal_device_t *hldev, int *got_tx)
 }
 
 /**
+ * xge_hal_device_rx_channel_enable_polling - Exclude an RX channel from
+ * the common poll loop so that its owner polls it explicitly.
+ * @channel: HAL channel handle.
+ */
+__HAL_STATIC_DEVICE __HAL_INLINE_DEVICE void
+xge_hal_device_rx_channel_enable_polling(xge_hal_channel_t *channel)
+{
+	channel->flags |= XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING;
+}
+
+__HAL_STATIC_DEVICE __HAL_INLINE_DEVICE void
+xge_hal_device_rx_channel_disable_polling(xge_hal_channel_t *channel)
+{
+	channel->flags &= ~XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING;
+}
+
+/**
  * xge_hal_device_mask_tx - Mask Tx interrupts.
  * @hldev: HAL device handle.
  *
diff --git a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
index 346f10b8bc..4cf18c2621 100644
--- a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
+++ b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
@@ -5044,7 +5044,7 @@ xge_hal_device_macaddr_find(xge_hal_device_t *hldev, macaddr_t wanted)
 		return XGE_HAL_ERR_INVALID_DEVICE;
 	}
 
-	for (i=1; i<XGE_HAL_MAX_MAC_ADDRESSES; i++) {
+	for (i=0; i<XGE_HAL_MAX_MAC_ADDRESSES; i++) {
 		(void) xge_hal_device_macaddr_get(hldev, i, &macaddr);
 		if (!xge_os_memcmp(macaddr, wanted, sizeof(macaddr_t))) {
 			return i;
diff --git a/usr/src/uts/common/os/exacct.c b/usr/src/uts/common/os/exacct.c
index cb8ced5239..43a7298c7b 100644
--- a/usr/src/uts/common/os/exacct.c
+++ b/usr/src/uts/common/os/exacct.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/exacct.h> #include <sys/exacct_catalog.h> #include <sys/disp.h> @@ -43,6 +41,7 @@ #include <sys/sysmacros.h> #include <sys/bitmap.h> #include <sys/msacct.h> +#include <sys/mac.h> /* * exacct usage and recording routines @@ -1163,6 +1162,271 @@ exacct_commit_proc(proc_t *p, int wstat) } static int +exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res) +{ + int attached = 1; + + switch (res) { + case AC_NET_NAME: + (void) ea_attach_item(record, ns->ns_name, + strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME); + break; + case AC_NET_CURTIME: + { + uint64_t now; + timestruc_t ts; + + gethrestime(&ts); + now = (uint64_t)(ulong_t)ts.tv_sec; + (void) ea_attach_item(record, &now, sizeof (uint64_t), + EXT_UINT64 | EXD_NET_STATS_CURTIME); + } + break; + case AC_NET_IBYTES: + (void) ea_attach_item(record, &ns->ns_ibytes, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES); + break; + case AC_NET_OBYTES: + (void) ea_attach_item(record, &ns->ns_obytes, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES); + break; + case AC_NET_IPKTS: + (void) ea_attach_item(record, &ns->ns_ipackets, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS); + break; + case AC_NET_OPKTS: + (void) ea_attach_item(record, &ns->ns_opackets, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS); + break; + case AC_NET_IERRPKTS: + (void) ea_attach_item(record, &ns->ns_ierrors, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS); + break; + case AC_NET_OERRPKTS: + (void) ea_attach_item(record, &ns->ns_oerrors, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS); + break; + default: + attached = 0; + } + return (attached); +} + +static int +exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res) +{ + int attached = 1; + + switch (res) { + case AC_NET_NAME: + (void) ea_attach_item(record, nd->nd_name, + strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME); + break; + case AC_NET_DEVNAME: + (void) ea_attach_item(record, nd->nd_devname, + strlen(nd->nd_devname) + 1, EXT_STRING | + EXD_NET_DESC_DEVNAME); + break; + case AC_NET_EHOST: + (void) ea_attach_item(record, &nd->nd_ehost, + sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST); + break; + case AC_NET_EDEST: + (void) ea_attach_item(record, &nd->nd_edest, + sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST); + break; + case AC_NET_VLAN_TPID: + (void) ea_attach_item(record, &nd->nd_vlan_tpid, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID); + break; + case AC_NET_VLAN_TCI: + (void) ea_attach_item(record, &nd->nd_vlan_tci, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI); + break; + case AC_NET_SAP: + (void) ea_attach_item(record, &nd->nd_sap, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP); + break; + case AC_NET_PRIORITY: + (void) ea_attach_item(record, &nd->nd_priority, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY); + break; + case AC_NET_BWLIMIT: + (void) ea_attach_item(record, &nd->nd_bw_limit, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT); + break; + case AC_NET_SADDR: + if (nd->nd_isv4) { + (void) ea_attach_item(record, &nd->nd_saddr[3], + sizeof (uint32_t), EXT_UINT32 | + EXD_NET_DESC_V4SADDR); + } else { + (void) ea_attach_item(record, &nd->nd_saddr, + sizeof (nd->nd_saddr), EXT_RAW | + EXD_NET_DESC_V6SADDR); + } + break; + case AC_NET_DADDR: + if (nd->nd_isv4) { + (void) ea_attach_item(record, &nd->nd_daddr[3], + sizeof (uint32_t), EXT_UINT32 | + EXD_NET_DESC_V4DADDR); + } else { + (void) 
ea_attach_item(record, &nd->nd_daddr, + sizeof (nd->nd_daddr), EXT_RAW | + EXD_NET_DESC_V6DADDR); + } + break; + case AC_NET_SPORT: + (void) ea_attach_item(record, &nd->nd_sport, + sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT); + break; + case AC_NET_DPORT: + (void) ea_attach_item(record, &nd->nd_dport, + sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT); + break; + case AC_NET_PROTOCOL: + (void) ea_attach_item(record, &nd->nd_protocol, + sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL); + break; + case AC_NET_DSFIELD: + (void) ea_attach_item(record, &nd->nd_dsfield, + sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD); + break; + default: + attached = 0; + } + return (attached); +} + +static ea_object_t * +exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type, + int what) +{ + int res; + int count; + ea_object_t *record; + + /* + * Assemble usage values into group. + */ + record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type); + for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++) + if (BT_TEST(mask, res)) { + if (what == EX_NET_LNDESC_REC || + what == EX_NET_FLDESC_REC) { + count += exacct_attach_netdesc_item( + (net_desc_t *)ninfo, record, res); + } else { + count += exacct_attach_netstat_item( + (net_stat_t *)ninfo, record, res); + } + } + if (count == 0) { + ea_free_object(record, EUP_ALLOC); + record = NULL; + } + return (record); +} + +int +exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo, + int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *), + void *ubuf, size_t ubufsize, size_t *actual, int what) +{ + ulong_t mask[AC_MASK_SZ]; + ea_object_t *net_desc; + ea_catalog_t record_type; + void *buf; + size_t bufsize; + int ret; + + mutex_enter(&ac_net->ac_lock); + if (ac_net->ac_state == AC_OFF) { + mutex_exit(&ac_net->ac_lock); + return (ENOTACTIVE); + } + bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ); + mutex_exit(&ac_net->ac_lock); + + switch (what) { + case EX_NET_LNDESC_REC: + record_type = EXD_GROUP_NET_LINK_DESC; + break; + case EX_NET_LNSTAT_REC: + record_type = EXD_GROUP_NET_LINK_STATS; + break; + case EX_NET_FLDESC_REC: + record_type = EXD_GROUP_NET_FLOW_DESC; + break; + case EX_NET_FLSTAT_REC: + record_type = EXD_GROUP_NET_FLOW_STATS; + break; + } + + net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what); + if (net_desc == NULL) + return (0); + + /* + * Pack object into buffer and pass to callback. + */ + bufsize = ea_pack_object(net_desc, NULL, 0); + buf = kmem_alloc(bufsize, KM_NOSLEEP); + if (buf == NULL) + return (ENOMEM); + + (void) ea_pack_object(net_desc, buf, bufsize); + + ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual); + + /* + * Free all previously allocations. + */ + kmem_free(buf, bufsize); + ea_free_object(net_desc, EUP_ALLOC); + return (ret); +} + +int +exacct_commit_netinfo(void *arg, int what) +{ + size_t size; + ulong_t mask[AC_MASK_SZ]; + struct exacct_globals *acg; + ac_info_t *ac_net; + + if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) { + /* + * acctctl module not loaded. Nothing to do. + */ + return (ENOTACTIVE); + } + + /* + * Even though each zone nominally has its own flow accounting settings + * (ac_flow), these are only maintained by and for the global zone. + * + * If this were to change in the future, this function should grow a + * second zoneid (or zone) argument, and use the corresponding zone's + * settings rather than always using those of the global zone. 
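+ *
+ * An illustrative call, roughly as the mac usage logger issues it for
+ * one link's statistics record (ns being a filled-in net_stat_t):
+ *
+ *	(void) exacct_commit_netinfo((void *)&ns, EX_NET_LNSTAT_REC);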
+ */ + acg = zone_getspecific(exacct_zone_key, global_zone); + ac_net = &acg->ac_net; + + mutex_enter(&ac_net->ac_lock); + if (ac_net->ac_state == AC_OFF) { + mutex_exit(&ac_net->ac_lock); + return (ENOTACTIVE); + } + bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ); + mutex_exit(&ac_net->ac_lock); + + return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback, + NULL, 0, &size, what)); +} + +static int exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res) { int attached = 1; diff --git a/usr/src/uts/common/inet/ip/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 3b5b3435d9..722c793b79 100644 --- a/usr/src/uts/common/inet/ip/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,13 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/inttypes.h> #include <sys/systm.h> @@ -36,6 +33,7 @@ #include <sys/ddi.h> #include <sys/vtrace.h> #include <inet/sctp_crc32.h> +#include <inet/ip.h> #include <sys/multidata.h> #include <sys/multidata_impl.h> @@ -154,14 +152,14 @@ ip_cksum(mblk_t *mp, int offset, uint_t sum) */ if (mlen) mlen += dp->db_cksumend - - dp->db_cksumstuff; + - dp->db_cksumstuff; else { w = (ushort_t *)(mp->b_rptr + dp->db_cksumstuff); if (is_odd(w)) goto slow; mlen = dp->db_cksumend - - dp->db_cksumstuff; + - dp->db_cksumstuff; } } else if (mlen == 0) return (psum); @@ -239,7 +237,7 @@ slow1: int odd; douio: odd = is_odd(dp->db_cksumstuff - - dp->db_cksumstart); + dp->db_cksumstart); if (pmlen == -1) { /* * Previous mlen was odd, so swap @@ -262,7 +260,7 @@ slow1: */ if (mlen) mlen += dp->db_cksumend - - dp->db_cksumstuff; + - dp->db_cksumstuff; else { w = (ushort_t *)(mp->b_rptr + dp->db_cksumstuff); @@ -385,7 +383,7 @@ done: sum = (sum & 0xFFFF) + (sum >> 16); sum = (sum & 0xFFFF) + (sum >> 16); TRACE_3(TR_FAC_IP, TR_IP_CKSUM_END, - "ip_cksum_end:(%S) type %d (%X)", "ip_cksum", 1, sum); + "ip_cksum_end:(%S) type %d (%X)", "ip_cksum", 1, sum); return (sum); } @@ -537,3 +535,30 @@ ip_md_cksum(pdesc_t *pd, int offset, uint_t sum) return (sum); } + +/* Return the IP checksum for the IP header at "iph". 
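+ * The fixed 20-byte header is summed as ten 16-bit words, options (if
+ * any) are folded in two words per iteration, the carries are added
+ * back, and the one's complement is taken, normalizing 0xffff to 0.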
*/ +uint16_t +ip_csum_hdr(ipha_t *ipha) +{ + uint16_t *uph; + uint32_t sum; + int opt_len; + + opt_len = (ipha->ipha_version_and_hdr_length & 0xF) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS; + uph = (uint16_t *)ipha; + sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + + uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; + if (opt_len > 0) { + do { + sum += uph[10]; + sum += uph[11]; + uph += 2; + } while (--opt_len); + } + sum = (sum & 0xFFFF) + (sum >> 16); + sum = ~(sum + (sum >> 16)) & 0xFFFF; + if (sum == 0xffff) + sum = 0; + return ((uint16_t)sum); +} diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 3c63231253..4d52a9eb66 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * mod_hash: flexible hash table implementation. * @@ -816,6 +814,22 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } +int +mod_hash_find_cb_rval(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, + int (*find_cb)(mod_hash_key_t, mod_hash_val_t), int *cb_rval) +{ + int res; + + rw_enter(&hash->mh_contents, RW_READER); + res = i_mod_hash_find_nosync(hash, key, val); + if (res == 0) { + *cb_rval = find_cb(key, *val); + } + rw_exit(&hash->mh_contents); + + return (res); +} + void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 23c5e91475..b71b956f8a 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -54,6 +54,7 @@ #include <sys/vfs.h> #include <sys/mntent.h> #include <sys/contract_impl.h> +#include <sys/dld_ioc.h> /* * There are two possible layers of privilege routines and two possible @@ -2267,3 +2268,23 @@ secpolicy_xvm_control(const cred_t *cr) return (EPERM); return (0); } + +/* + * secpolicy_dld_ioctl + * + * Determine if the subject has permission to use certain dld ioctls. + * Each ioctl should require a limited number of privileges. A large + * number indicates a poor design. + */ +int +secpolicy_dld_ioctl(const cred_t *cr, const char *dld_priv, const char *msg) +{ + int rv; + + if ((rv = priv_getbyname(dld_priv, 0)) >= 0) { + return (PRIV_POLICY(cr, rv, B_FALSE, EPERM, msg)); + } + /* priv_getbyname() returns -ve errno */ + return (-rv); + +} diff --git a/usr/src/uts/common/inet/sctp_crc32.c b/usr/src/uts/common/os/sctp_crc32.c index 21dcaf18fd..38e049e440 100644 --- a/usr/src/uts/common/inet/sctp_crc32.c +++ b/usr/src/uts/common/os/sctp_crc32.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
 */
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 
 /*
@@ -68,7 +65,7 @@ static uint32_t
 flip32(uint32_t w)
 {
 	return (((w >> 24) | ((w >> 8) & 0xff00) | ((w << 8) & 0xff0000) |
-		(w << 24)));
+	    (w << 24)));
 }
 #endif
 
diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c
index 6edebecdfe..6ed5e749f1 100644
--- a/usr/src/uts/common/os/space.c
+++ b/usr/src/uts/common/os/space.c
@@ -359,23 +359,14 @@ space_free(char *key)
 const uint32_t crc32_table[256] = { CRC32_TABLE };
 
 /*
- * We need to fanout load from NIC which can overwhelm a single
- * CPU. A 10Gb NIC interrupting a single CPU is a good example.
- * Instead of fanning out to random CPUs, it a big performance
- * win if you can fanout to the threads on the same core (niagara)
- * that is taking interrupts.
- *
- * We need a better mechanism to figure out the other threads on
- * the same core or cores on the same chip which share caches etc.
- * but for time being, this will suffice.
+ * We need to fan out load from a NIC that can overwhelm a single CPU.
+ * This is especially important on systems with slower CPUs (the sun4v
+ * architecture). mac_soft_ring_enable is false on all systems except
+ * sun4v, where soft rings are enabled by default (see
+ * sun4v/os/mach_startup.c).
 */
-#define NUMBER_OF_THREADS_PER_CPU 4
-uint_t ip_threads_per_cpu = NUMBER_OF_THREADS_PER_CPU;
-
-/* Global flag to enable/disable soft ring facility */
-boolean_t ip_squeue_soft_ring = B_FALSE;
+boolean_t mac_soft_ring_enable = B_FALSE;
 
 /*
 * Global iscsi boot prop
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index cd8a0a2a62..442ced2b51 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -27,8 +27,6 @@
 * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/param.h> @@ -8450,18 +8448,25 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA); if (mp->b_datap->db_type == M_DATA) { if (flags != NULL) { - *flags = DB_CKSUMFLAGS(mp); - if (*flags & HCK_PARTIALCKSUM) { - if (start != NULL) - *start = (uint32_t)DB_CKSUMSTART(mp); - if (stuff != NULL) - *stuff = (uint32_t)DB_CKSUMSTUFF(mp); - if (end != NULL) - *end = (uint32_t)DB_CKSUMEND(mp); + *flags = DB_CKSUMFLAGS(mp) & (HCK_IPV4_HDRCKSUM | + HCK_PARTIALCKSUM | HCK_FULLCKSUM | + HCK_FULLCKSUM_OK); + if ((*flags & (HCK_PARTIALCKSUM | + HCK_FULLCKSUM)) != 0) { if (value != NULL) *value = (uint32_t)DB_CKSUM16(mp); - } else if ((*flags & HW_LSO) && (value != NULL)) - *value = (uint32_t)DB_LSOMSS(mp); + if ((*flags & HCK_PARTIALCKSUM) != 0) { + if (start != NULL) + *start = + (uint32_t)DB_CKSUMSTART(mp); + if (stuff != NULL) + *stuff = + (uint32_t)DB_CKSUMSTUFF(mp); + if (end != NULL) + *end = + (uint32_t)DB_CKSUMEND(mp); + } + } } } else { pattrinfo_t hck_attr = {PATTR_HCKSUM}; @@ -8488,6 +8493,28 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, } } +void +lso_info_set(mblk_t *mp, uint32_t mss, uint32_t flags) +{ + ASSERT(DB_TYPE(mp) == M_DATA); + + /* Set the flags */ + DB_LSOFLAGS(mp) |= flags; + DB_LSOMSS(mp) = mss; +} + +void +lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) +{ + ASSERT(DB_TYPE(mp) == M_DATA); + + if (flags != NULL) { + *flags = DB_CKSUMFLAGS(mp) & HW_LSO; + if ((*flags != 0) && (mss != NULL)) + *mss = (uint32_t)DB_LSOMSS(mp); + } +} + /* * Checksum buffer *bp for len bytes with psum partial checksum, * or 0 if none, and return the 16 bit partial checksum. diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 5fe7393f56..cecccf50ab 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -191,6 +191,7 @@ CHKHDRS= \ dld_impl.h \ dld_ioc.h \ dls.h \ + dls_mgmt.h \ dls_impl.h \ dma_i8237A.h \ dnlc.h \ @@ -353,7 +354,13 @@ CHKHDRS= \ lwp_upimutex_impl.h \ lpif.h \ mac.h \ + mac_client.h \ + mac_client_impl.h \ + mac_flow.h \ + mac_flow_impl.h \ mac_impl.h \ + mac_provider.h \ + mac_soft_ring.h \ machelf.h \ map.h \ md4.h \ @@ -418,6 +425,7 @@ CHKHDRS= \ pci.h \ pcie.h \ pci_impl.h \ + pci_tools.h \ pcmcia.h \ pctypes.h \ pem.h \ diff --git a/usr/src/uts/common/sys/acctctl.h b/usr/src/uts/common/sys/acctctl.h index 5019d36c4c..1dfa8e8577 100644 --- a/usr/src/uts/common/sys/acctctl.h +++ b/usr/src/uts/common/sys/acctctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ACCTCTL_H #define _SYS_ACCTCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/bitmap.h> #include <sys/sysmacros.h> @@ -44,10 +41,11 @@ extern "C" { /* * modes */ -#define AC_PROC (0x1 << 28) /* change process accounting settings */ -#define AC_TASK (0x2 << 28) /* change task accounting settings */ -#define AC_FLOW (0x4 << 28) /* change flow accounting settings */ -#define AC_MODE(x) ((x) & 0xf0000000) +#define AC_PROC (0x1 << 24) /* change process accounting settings */ +#define AC_TASK (0x2 << 24) /* change task accounting settings */ +#define AC_FLOW (0x4 << 24) /* change flow accounting settings */ +#define AC_NET (0x8 << 24) /* change network accounting settings */ +#define AC_MODE(x) ((x) & 0xff000000) /* * options @@ -58,7 +56,7 @@ extern "C" { #define AC_RES_GET (0x08) /* get a list of enabled resources */ #define AC_STATE_SET (0x10) /* set accounting mode state (on/off) */ #define AC_STATE_GET (0x20) /* get accounting mode state */ -#define AC_OPTION(x) ((x) & 0x0fffffff) +#define AC_OPTION(x) ((x) & 0x00ffffff) /* * Process accounting resource IDs @@ -113,8 +111,36 @@ extern "C" { #define AC_FLOW_ANAME 13 /* action instance name */ #define AC_FLOW_MAX_RES 13 /* must be equal to the number above */ -#define AC_MAX_RES_TMP MAX(AC_PROC_MAX_RES, AC_TASK_MAX_RES) -#define AC_MAX_RES MAX(AC_MAX_RES_TMP, AC_FLOW_MAX_RES) +/* + * Network accounting resource IDs + */ +#define AC_NET_NAME 1 /* flow name */ +#define AC_NET_EHOST 2 /* ethernet source address */ +#define AC_NET_EDEST 3 /* ethernet destination address */ +#define AC_NET_VLAN_TPID 4 /* VLAN protocol ID */ +#define AC_NET_VLAN_TCI 5 /* VLAN tag control info. */ +#define AC_NET_SAP 6 /* SAP */ +#define AC_NET_PRIORITY 7 /* Priority */ +#define AC_NET_BWLIMIT 8 /* Bandwidth limit */ +#define AC_NET_DEVNAME 9 /* Device name */ +#define AC_NET_SADDR 10 /* Source IP address */ +#define AC_NET_DADDR 11 /* Dest IP address */ +#define AC_NET_SPORT 12 /* Source Port */ +#define AC_NET_DPORT 13 /* Dest Port */ +#define AC_NET_PROTOCOL 14 /* Protocol */ +#define AC_NET_DSFIELD 15 /* DiffServ field */ +#define AC_NET_CURTIME 16 /* Current Time */ +#define AC_NET_IBYTES 17 /* Inbound Bytes */ +#define AC_NET_OBYTES 18 /* Outbound Bytes */ +#define AC_NET_IPKTS 19 /* Inbound Packets */ +#define AC_NET_OPKTS 20 /* Outbound Packets */ +#define AC_NET_IERRPKTS 21 /* Inbound Error Packets */ +#define AC_NET_OERRPKTS 22 /* Outbound Error Packets */ +#define AC_NET_MAX_RES 22 /* must be equal to the number above */ + +#define AC_MAX_RES \ + MAX(MAX(MAX(AC_PROC_MAX_RES, AC_TASK_MAX_RES), AC_FLOW_MAX_RES), \ + AC_NET_MAX_RES) #define AC_MASK_SZ BT_BITOUL(AC_MAX_RES + 1) /* @@ -150,7 +176,7 @@ extern zone_key_t exacct_zone_key; /* * Per-zone exacct settings. Each zone may have its own settings for - * process, task, and flow accounting. + * process, task, flow, and network accounting. 
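 *
 * Network accounting is requested through the same acctctl(2) command
 * word layout as the others, e.g. AC_NET | AC_STATE_SET to switch it on
 * and AC_NET | AC_RES_SET to choose which AC_NET_* resources to record.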
* * Per-zone flow accounting has not yet been implemented, so this * provides zones with the view that flow accounting in the zone hasn't @@ -164,6 +190,7 @@ struct exacct_globals { ac_info_t ac_task; ac_info_t ac_proc; ac_info_t ac_flow; + ac_info_t ac_net; list_node_t ac_link; }; diff --git a/usr/src/uts/common/sys/aggr.h b/usr/src/uts/common/sys/aggr.h index 740ac7f6f9..c63cc9e99f 100644 --- a/usr/src/uts/common/sys/aggr.h +++ b/usr/src/uts/common/sys/aggr.h @@ -28,9 +28,8 @@ #include <sys/types.h> #include <sys/ethernet.h> -#include <sys/mac.h> -#include <sys/dls.h> #include <sys/param.h> +#include <sys/mac.h> #include <sys/dld_ioc.h> #ifdef __cplusplus @@ -38,7 +37,7 @@ extern "C" { #endif /* - * Note that the datastructures defined here define an ioctl interface + * Note that the data structures defined here define an ioctl interface * that is shared betwen user and kernel space. The aggr driver thus * assumes that the structures have identical layout and size when * compiled in either IPL32 or LP64. diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 62fe0de59b..a1f7e82849 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -27,8 +27,10 @@ #define _SYS_AGGR_IMPL_H #include <sys/types.h> -#include <sys/mac.h> #include <sys/mac_ether.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/aggr_lacp.h> #ifdef __cplusplus @@ -46,6 +48,33 @@ extern "C" { #define AGGR_MODIFY_LACP_TIMER 0x08 /* + * Possible value of aggr_rseudo_rx_ring_t.arr_flags. Set when the ring entry + * in the pseudo RX group is used. + */ +#define MAC_PSEUDO_RING_INUSE 0x01 + +typedef struct aggr_unicst_addr_s { + uint8_t aua_addr[ETHERADDRL]; + struct aggr_unicst_addr_s *aua_next; +} aggr_unicst_addr_t; + +typedef struct aggr_pseudo_rx_ring_s { + mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */ + struct aggr_port_s *arr_port; + mac_ring_handle_t arr_hw_rh; + uint_t arr_flags; + uint64_t arr_gen; +} aggr_pseudo_rx_ring_t; + +typedef struct aggr_pseudo_rx_group_s { + struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */ + mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */ + aggr_unicst_addr_t *arg_macaddr; + aggr_pseudo_rx_ring_t arg_rings[MAX_RINGS_PER_GROUP]; + uint_t arg_ring_cnt; +} aggr_pseudo_rx_group_t; + +/* * A link aggregation MAC port. * Note that lp_next is protected by the lg_lock of the group the * port is part of. 
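 * In this patch the port also carries the handles required by the new
 * MAC client interfaces (lp_mch, lp_mphp, lp_mah) and, where the NIC
 * provides one, the underlying hardware RX group (lp_hwgh) whose rings
 * back the pseudo RX rings declared above.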
@@ -63,13 +92,13 @@ typedef struct aggr_port_s { lp_collector_enabled : 1, lp_promisc_on : 1, lp_no_link_update : 1, - lp_pad_bits : 27; - uint32_t lp_closing; + lp_grp_added : 1, + lp_closing : 1, + lp_pad_bits : 25; mac_handle_t lp_mh; + mac_client_handle_t lp_mch; const mac_info_t *lp_mip; mac_notify_handle_t lp_mnh; - mac_rx_handle_t lp_mrh; - krwlock_t lp_lock; uint_t lp_tx_idx; /* idx in group's tx array */ uint64_t lp_ifspeed; link_state_t lp_link_state; @@ -78,15 +107,15 @@ typedef struct aggr_port_s { uint64_t lp_ether_stat[ETHER_NSTAT]; aggr_lacp_port_t lp_lacp; /* LACP state */ lacp_stats_t lp_lacp_stats; - const mac_txinfo_t *lp_txinfo; uint32_t lp_margin; -} aggr_port_t; + mac_promisc_handle_t lp_mphp; + mac_unicast_handle_t lp_mah; -typedef struct lg_mcst_addr_s lg_mcst_addr_t; -struct lg_mcst_addr_s { - lg_mcst_addr_t *lg_mcst_nextp; - uint8_t lg_mcst_addr[MAXMACADDRLEN]; -}; + /* List of non-primary addresses that requires promiscous mode set */ + aggr_unicst_addr_t *lp_prom_addr; + /* handle of the underlying HW RX group */ + mac_group_handle_t lp_hwgh; +} aggr_port_t; /* * A link aggregation group. @@ -105,7 +134,6 @@ struct lg_mcst_addr_s { * */ typedef struct aggr_grp_s { - krwlock_t lg_lock; datalink_id_t lg_linkid; uint16_t lg_key; /* key (group port number) */ uint32_t lg_refs; /* refcount */ @@ -116,16 +144,15 @@ typedef struct aggr_grp_s { lg_addr_fixed : 1, /* fixed MAC address? */ lg_started : 1, /* group started? */ lg_promisc : 1, /* in promiscuous mode? */ - lg_gldv3_polling : 1, lg_zcopy : 1, lg_vlan : 1, lg_force : 1, - lg_pad_bits : 8; + lg_pad_bits : 9; aggr_port_t *lg_ports; /* list of configured ports */ aggr_port_t *lg_mac_addr_port; mac_handle_t lg_mh; - uint_t lg_rx_resources; uint_t lg_nattached_ports; + krwlock_t lg_tx_lock; uint_t lg_ntx_ports; aggr_port_t **lg_tx_ports; /* array of tx ports */ uint_t lg_tx_ports_size; /* size of lg_tx_ports */ @@ -140,14 +167,32 @@ typedef struct aggr_grp_s { uint32_t lg_hcksum_txflags; uint_t lg_max_sdu; uint32_t lg_margin; - lg_mcst_addr_t *lg_mcst_list; /* A list of multicast addresses */ -} aggr_grp_t; -#define AGGR_LACP_LOCK_WRITER(grp) rw_enter(&(grp)->aggr.gl_lock, RW_WRITER); -#define AGGR_LACP_UNLOCK(grp) rw_exit(&(grp)->aggr.gl_lock); -#define AGGR_LACP_LOCK_HELD_WRITER(grp) RW_WRITE_HELD(&(grp)->aggr.gl_lock) -#define AGGR_LACP_LOCK_READER(grp) rw_enter(&(grp)->aggr.gl_lock, RW_READER); -#define AGGR_LACP_LOCK_HELD_READER(grp) RW_READ_HELD(&(grp)->aggr.gl_lock) + /* + * The following fields are used by the LACP packets processing. + * Specifically, as the LACP packets processing is not performance + * critical, all LACP packets will be handled by a dedicated thread + * instead of in the mac_rx() call. This is to avoid the dead lock + * with mac_unicast_remove(), which holding the mac perimeter of the + * aggr, and wait for the mr_refcnt of the RX ring to drop to zero. + */ + kmutex_t lg_lacp_lock; + kcondvar_t lg_lacp_cv; + mblk_t *lg_lacp_head; + mblk_t *lg_lacp_tail; + kthread_t *lg_lacp_rx_thread; + boolean_t lg_lacp_done; + aggr_pseudo_rx_group_t lg_rx_group; + + /* + * The following fields are used by aggr to wait for all the + * aggr_port_notify_cb() and aggr_port_timer_thread() to finish + * before it calls mac_unregister() when the aggr is deleted. 
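+ *
+ * A sketch of the intended pattern, based on the declarations of
+ * aggr_grp_port_hold()/_rele()/_wait() later in this file:
+ *
+ *	aggr_grp_port_hold(port);	taken before the callback runs
+ *	... callback or timer body ...
+ *	aggr_grp_port_rele(port);	drops lg_port_ref
+ *
+ *	aggr_grp_port_wait(grp);	blocks until lg_port_ref reaches 0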
+ */ + kmutex_t lg_port_lock; + kcondvar_t lg_port_cv; + int lg_port_ref; +} aggr_grp_t; #define AGGR_GRP_REFHOLD(grp) { \ atomic_add_32(&(grp)->lg_refs, 1); \ @@ -195,33 +240,34 @@ extern int aggr_grp_info(datalink_id_t, void *, aggr_grp_info_new_grp_fn_t, aggr_grp_info_new_port_fn_t); extern void aggr_grp_notify(aggr_grp_t *, uint32_t); extern boolean_t aggr_grp_attach_port(aggr_grp_t *, aggr_port_t *); -extern boolean_t aggr_grp_detach_port(aggr_grp_t *, aggr_port_t *, boolean_t); +extern boolean_t aggr_grp_detach_port(aggr_grp_t *, aggr_port_t *); extern void aggr_grp_port_mac_changed(aggr_grp_t *, aggr_port_t *, boolean_t *, boolean_t *); extern int aggr_grp_add_ports(datalink_id_t, uint_t, boolean_t, laioc_port_t *); extern int aggr_grp_rem_ports(datalink_id_t, uint_t, laioc_port_t *); extern boolean_t aggr_grp_update_ports_mac(aggr_grp_t *); -extern int aggr_grp_modify(datalink_id_t, aggr_grp_t *, uint8_t, uint32_t, - boolean_t, const uchar_t *, aggr_lacp_mode_t, aggr_lacp_timer_t); +extern int aggr_grp_modify(datalink_id_t, uint8_t, uint32_t, boolean_t, + const uchar_t *, aggr_lacp_mode_t, aggr_lacp_timer_t); extern void aggr_grp_multicst_port(aggr_port_t *, boolean_t); extern uint_t aggr_grp_count(void); extern void aggr_port_init(void); extern void aggr_port_fini(void); -extern int aggr_port_create(const datalink_id_t, boolean_t, aggr_port_t **); +extern int aggr_port_create(aggr_grp_t *, const datalink_id_t, boolean_t, + aggr_port_t **); extern void aggr_port_delete(aggr_port_t *); extern void aggr_port_free(aggr_port_t *); extern int aggr_port_start(aggr_port_t *); extern void aggr_port_stop(aggr_port_t *); extern int aggr_port_promisc(aggr_port_t *, boolean_t); -extern int aggr_port_unicst(aggr_port_t *, uint8_t *); +extern int aggr_port_unicst(aggr_port_t *); extern int aggr_port_multicst(void *, boolean_t, const uint8_t *); extern uint64_t aggr_port_stat(aggr_port_t *, uint_t); -extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *, boolean_t); +extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); -extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *); +extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); extern mblk_t *aggr_m_tx(void *, mblk_t *); extern void aggr_send_port_enable(aggr_port_t *); @@ -236,10 +282,20 @@ extern void aggr_lacp_set_mode(aggr_grp_t *, aggr_lacp_mode_t, aggr_lacp_timer_t); extern void aggr_lacp_update_mode(aggr_grp_t *, aggr_lacp_mode_t); extern void aggr_lacp_update_timer(aggr_grp_t *, aggr_lacp_timer_t); -extern void aggr_lacp_rx(aggr_port_t *, mblk_t *); +extern void aggr_lacp_rx_enqueue(aggr_port_t *, mblk_t *); extern void aggr_lacp_port_attached(aggr_port_t *); extern void aggr_lacp_port_detached(aggr_port_t *); -extern void aggr_lacp_policy_changed(aggr_grp_t *); +extern void aggr_port_lacp_set_mode(aggr_grp_t *, aggr_port_t *); + +extern void aggr_lacp_rx_thread(void *); +extern void aggr_recv_lacp(aggr_port_t *, mac_resource_handle_t, mblk_t *); + +extern void aggr_grp_port_hold(aggr_port_t *); +extern void aggr_grp_port_rele(aggr_port_t *); +extern void aggr_grp_port_wait(aggr_grp_t *); + +extern int aggr_port_addmac(aggr_port_t *, const uint8_t *); +extern void aggr_port_remmac(aggr_port_t *, const uint8_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/aggr_lacp.h b/usr/src/uts/common/sys/aggr_lacp.h index ebcc07cb12..ef8c7408ac 100644 --- a/usr/src/uts/common/sys/aggr_lacp.h +++ 
b/usr/src/uts/common/sys/aggr_lacp.h @@ -157,8 +157,6 @@ typedef struct Agg { aggr_lacp_timer_t PeriodicTimer; /* AGGR_LACP_{LONG,SHORT} */ uint64_t TimeOfLastOperChange; /* Time in state */ boolean_t ready; /* Ready_N for all ports TRUE */ - - krwlock_t gl_lock; } Agg_t; /* @@ -192,6 +190,19 @@ typedef struct state_machine { } state_machine_t; /* + * The following three flags are set when specific timer is timed out; used + * by the LACP timer handler thread. + */ +#define LACP_PERIODIC_TIMEOUT 0x01 +#define LACP_WAIT_WHILE_TIMEOUT 0x02 +#define LACP_CURRENT_WHILE_TIMEOUT 0x04 +/* + * Set when the port is being deleted; used to inform the LACP timer handler + * thread to exit. + */ +#define LACP_THREAD_EXIT 0x08 + +/* * 802.3ad Variables associated with each port (section 43.4.7) */ typedef struct aggr_lacp_port { @@ -228,6 +239,10 @@ typedef struct aggr_lacp_port { lacp_timer_t current_while_timer; lacp_timer_t periodic_timer; lacp_timer_t wait_while_timer; + uint32_t lacp_timer_bits; + kthread_t *lacp_timer_thread; + kmutex_t lacp_timer_lock; + kcondvar_t lacp_timer_cv; hrtime_t time; } aggr_lacp_port_t; diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index d3663f464f..1510b46123 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -38,6 +38,7 @@ #include <sys/types.h> #include <sys/stream.h> #include <sys/dld_ioc.h> +#include <sys/mac_flow.h> #include <sys/conf.h> #include <sys/sad.h> #include <net/if.h> @@ -84,14 +85,18 @@ extern "C" { */ #define DLD_DRIVER_NAME "dld" +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + /* * IOCTL codes and data structures. */ #define DLDIOC_ATTR DLDIOC(0x03) typedef struct dld_ioc_attr { - datalink_id_t dia_linkid; - uint_t dia_max_sdu; + datalink_id_t dia_linkid; + uint_t dia_max_sdu; } dld_ioc_attr_t; #define DLDIOC_VLAN_ATTR DLDIOC(0x04) @@ -100,7 +105,6 @@ typedef struct dld_ioc_vlan_attr { uint16_t div_vid; datalink_id_t div_linkid; boolean_t div_force; - boolean_t div_implicit; } dld_ioc_vlan_attr_t; #define DLDIOC_PHYS_ATTR DLDIOC(0x05) @@ -203,15 +207,8 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; char diz_link[MAXLINKNAMELEN]; - boolean_t diz_is_ppa_hack; } dld_ioc_zid_t; -#define DLDIOC_GETZID DLDIOC(0x13) -typedef struct dld_ioc_getzid { - datalink_id_t dig_linkid; - zoneid_t dig_zid; -} dld_ioc_getzid_t; - /* * data-link autopush configuration. */ @@ -221,8 +218,72 @@ struct dlautopush { char dap_aplist[MAXAPUSH][FMNAMESZ+1]; }; -#define DLDIOC_SETMACPROP DLDIOC(0x14) -#define DLDIOC_GETMACPROP DLDIOC(0x15) +#define DLDIOC_MACADDRGET DLDIOC(0x15) +typedef struct dld_ioc_macaddrget { + datalink_id_t dig_linkid; + uint_t dig_count; + uint_t dig_size; +} dld_ioc_macaddrget_t; + +/* possible flags for dmi_flags below */ +#define DLDIOCMACADDR_USED 0x1 /* address slot used */ + +typedef struct dld_macaddrinfo { + uint_t dmi_slot; + uint_t dmi_flags; + uint_t dmi_addrlen; + uchar_t dmi_addr[MAXMACADDRLEN]; + char dmi_client_name[MAXNAMELEN]; + datalink_id_t dma_client_linkid; +} dld_macaddrinfo_t; + +/* + * IOCTL codes and data structures for flowadm. 
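+ *
+ * A sketch of how user space is expected to drive these ioctls on the
+ * dld control node (illustrative only; the real consumer is flowadm(1M)
+ * by way of libdladm), using the structures defined below:
+ *
+ *	dld_ioc_addflow_t af;
+ *
+ *	af.af_linkid = linkid;
+ *	af.af_flow_desc = fdesc;
+ *	af.af_resource_props = mrp;
+ *	(void) strlcpy(af.af_name, "webflow", MAXNAMELEN);
+ *	(void) ioctl(fd, DLDIOC_ADDFLOW, &af);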
+ */ +#define DLDIOC_ADDFLOW DLDIOC(0x16) +typedef struct dld_ioc_addflow { + datalink_id_t af_linkid; + flow_desc_t af_flow_desc; + mac_resource_props_t af_resource_props; + char af_name[MAXNAMELEN]; +} dld_ioc_addflow_t; + +#define DLDIOC_REMOVEFLOW DLDIOC(0x17) +typedef struct dld_ioc_removeflow { + char rf_name[MAXNAMELEN]; +} dld_ioc_removeflow_t; + +#define DLDIOC_MODIFYFLOW DLDIOC(0x18) +typedef struct dld_ioc_modifyflow { + char mf_name[MAXNAMELEN]; + mac_resource_props_t mf_resource_props; +} dld_ioc_modifyflow_t; + +#define DLDIOC_WALKFLOW DLDIOC(0x19) +typedef struct dld_ioc_walkflow { + datalink_id_t wf_linkid; + char wf_name[MAXNAMELEN]; + uint32_t wf_nflows; + uint_t wf_len; +} dld_ioc_walkflow_t; + +typedef struct dld_flowinfo { + datalink_id_t fi_linkid; + flow_desc_t fi_flow_desc; + mac_resource_props_t fi_resource_props; + char fi_flowname[MAXNAMELEN]; + uint32_t fi_pad; +} dld_flowinfo_t; + +#define DLDIOC_USAGELOG DLDIOC(0x1a) +typedef struct dld_ioc_usagelog { + mac_logtype_t ul_type; + boolean_t ul_onoff; + uint_t ul_interval; +} dld_ioc_usagelog_t; + +#define DLDIOC_SETMACPROP DLDIOC(0x1b) +#define DLDIOC_GETMACPROP DLDIOC(0x1c) #define MAC_PROP_VERSION 1 typedef struct dld_ioc_macprop_s { @@ -236,7 +297,111 @@ typedef struct dld_ioc_macprop_s { char pr_val[1]; } dld_ioc_macprop_t; +#define DLDIOC_GETHWGRP DLDIOC(0x1d) + +typedef struct dld_ioc_hwgrpget { + datalink_id_t dih_linkid; + uint_t dih_n_groups; /* number of groups included in ioc */ + uint_t dih_size; +} dld_ioc_hwgrpget_t; + +#define MAXCLIENTNAMELEN 1024 +typedef struct dld_hwgrpinfo { + char dhi_link_name[MAXLINKNAMELEN]; + uint_t dhi_grp_num; + uint_t dhi_grp_type; + uint_t dhi_n_rings; + uint_t dhi_n_clnts; + /* XXXX later we should use dhi_n_clnts * MAXNAMELEN for dhi_clnts */ + char dhi_clnts[MAXCLIENTNAMELEN]; +} dld_hwgrpinfo_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + #ifdef _KERNEL + +#define DLD_CAPAB_DIRECT 0x00000001 +#define DLD_CAPAB_POLL 0x00000002 +#define DLD_CAPAB_PERIM 0x00000003 +#define DLD_CAPAB_LSO 0x00000004 + +#define DLD_ENABLE 0x00000001 +#define DLD_DISABLE 0x00000002 +#define DLD_QUERY 0x00000003 + +/* + * GLDv3 entry point for negotiating capabilities. + * This is exposed to IP after negotiation of DL_CAPAB_DLD. + * + * This function takes the following arguments: + * handle: used for identifying the interface to operate on (provided by dld). + * type: capability type. + * arg: points to a capability-specific structure. + * flags: used for indicating whether to enable or disable a capability. + * + * With this function, capability negotiation is reduced from a multi-step + * process to just one single function call. + * e.g. the following code would pass 'x' from IP to dld and obtain + * arg.output_arg from dld: + * + * arg.input_arg = x; + * rc = (*dld_capab)(handle, DLD_CAPAB_XXX, &arg, DLD_ENABLE); + * ill->info1 = arg.output_arg; + */ +typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); + +/* + * Direct Tx/Rx capability. + */ +typedef struct dld_capab_direct_s { + /* + * Rx entry point and handle, owned by IP. + */ + uintptr_t di_rx_cf; + void *di_rx_ch; + + /* + * Tx entry points and handle, owned by DLD. + */ + /* Entry point for transmitting packets */ + uintptr_t di_tx_df; + void *di_tx_dh; + + /* flow control notification callback */ + uintptr_t di_tx_cb_df; /* callback registration/de-registration */ + void *di_tx_cb_dh; +} dld_capab_direct_t; + +/* + * Polling/softring capability. 
+ */ +#define POLL_SOFTRING 0x00000001 +typedef struct dld_capab_poll_s { + uintptr_t poll_ring_add_cf; + uintptr_t poll_ring_remove_cf; + uintptr_t poll_ring_quiesce_cf; + uintptr_t poll_ring_restart_cf; + uintptr_t poll_ring_bind_cf; + void *poll_ring_ch; + uintptr_t poll_mac_accept_df; + void *poll_mac_dh; +} dld_capab_poll_t; + +/* + * LSO capability + */ +/* + * Currently supported flags for LSO. + */ +#define DLD_LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */ + +typedef struct dld_capab_lso_s { + uint_t lso_flags; /* capability flags */ + uint_t lso_max; /* maximum payload */ +} dld_capab_lso_t; + int dld_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); int dld_open(queue_t *, dev_t *, int, int, cred_t *); int dld_close(queue_t *); @@ -245,6 +410,13 @@ void dld_wsrv(queue_t *); void dld_init_ops(struct dev_ops *, const char *); void dld_fini_ops(struct dev_ops *); int dld_autopush(dev_t *, struct dlautopush *); + +int dld_add_flow(datalink_id_t, char *, flow_desc_t *, + mac_resource_props_t *); +int dld_remove_flow(char *); +int dld_modify_flow(char *, mac_resource_props_t *); +int dld_walk_flow(dld_ioc_walkflow_t *, intptr_t); + #endif #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 8d2138cc52..906fd6fe15 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -27,13 +27,12 @@ #define _SYS_DLD_IMPL_H #include <sys/types.h> -#include <sys/conf.h> +#include <sys/list.h> #include <sys/ethernet.h> #include <sys/stream.h> #include <sys/dlpi.h> -#include <sys/mac.h> -#include <sys/dls.h> #include <sys/dld.h> +#include <sys/dls_impl.h> #ifdef __cplusplus extern "C" { @@ -57,39 +56,50 @@ typedef enum { DLD_ACTIVE } dld_passivestate_t; -typedef struct dld_str dld_str_t; -typedef void (*dld_tx_t)(struct dld_str *, mblk_t *); - /* - * dld_str_t object definition. + * The dld_str_t object definition and protection scheme for each member + * is described below. The framework locking mechanism details are described in + * mac_impl.h and mac.c + * + * Write Once Only (WO): Typically these are initialized when the end point + * is created or initialized and don't change subsequently + * + * Serializer (SL): Protected by the Serializer. All modify operations on an + * end point go through the serializer. Readers don't care about reading + * these fields atomically, or readers also use the serializer to see the + * values atomically. + * + * Lock: kmutex_t or kwrlock_t lock. Modify operations still go through the + * serializer, the lock helps synchronize readers with writers. */ -struct dld_str { + +struct dld_str_s { /* Protected by */ /* * Major number of the device */ - major_t ds_major; + major_t ds_major; /* WO */ /* * Ephemeral minor number for the object. */ - minor_t ds_minor; + minor_t ds_minor; /* WO */ /* - * Read/write queues for the stream which the object represents. + * PPA number this stream is attached to. */ - queue_t *ds_rq; - queue_t *ds_wq; + t_uscalar_t ds_ppa; /* SL */ /* - * Lock to protect this structure. + * Read/write queues for the stream which the object represents. */ - krwlock_t ds_lock; + queue_t *ds_rq; /* WO */ + queue_t *ds_wq; /* WO */ /* * Stream is open to DLD_CONTROL (control node) or * DLD_DLPI (DLS provider) node. */ - uint_t ds_type; + uint_t ds_type; /* WO */ /* * The following fields are only used for DLD_DLPI type objects. @@ -98,158 +108,123 @@ struct dld_str { /* * Current DLPI state. 
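+	 * (one of the DL_* states defined in <sys/dlpi.h>, e.g.
+	 * DL_UNATTACHED, DL_UNBOUND or DL_IDLE)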
*/ - t_uscalar_t ds_dlstate; + t_uscalar_t ds_dlstate; /* ds_lock */ /* * DLPI style */ - t_uscalar_t ds_style; + t_uscalar_t ds_style; /* WO */ /* * Currently bound DLSAP. */ - uint16_t ds_sap; - - /* - * Handle of the data-link channel that is used by this object. - */ - dls_channel_t ds_dc; + uint16_t ds_sap; /* SL */ /* * Handle of the MAC that is used by the data-link interface. */ - mac_handle_t ds_mh; - - /* - * VLAN identifier of the data-link interface. - */ - uint16_t ds_vid; + mac_handle_t ds_mh; /* SL */ + mac_client_handle_t ds_mch; /* SL */ /* * Promiscuity level information. */ - uint32_t ds_promisc; + uint32_t ds_promisc; /* SL */ + mac_promisc_handle_t ds_mph; + mac_promisc_handle_t ds_vlan_mph; /* * Immutable information of the MAC which the channel is using. */ - const mac_info_t *ds_mip; + const mac_info_t *ds_mip; /* SL */ /* * Current packet priority. */ - uint_t ds_pri; + uint_t ds_pri; /* SL */ /* * Handle of our MAC notification callback. */ - mac_notify_handle_t ds_mnh; + mac_notify_handle_t ds_mnh; /* SL */ /* * Set of enabled DL_NOTE... notifications. (See dlpi.h). */ - uint32_t ds_notifications; - - /* - * Cached MAC unicast addresses. - */ - uint8_t ds_fact_addr[MAXMACADDRLEN]; - uint8_t ds_curr_addr[MAXMACADDRLEN]; + uint32_t ds_notifications; /* SL */ /* * Mode: unitdata, fast-path or raw. */ - dld_str_mode_t ds_mode; + dld_str_mode_t ds_mode; /* SL */ /* * Native mode state. */ - boolean_t ds_native; + boolean_t ds_native; /* SL */ /* * IP polling is operational if this flag is set. */ - boolean_t ds_polling; - boolean_t ds_soft_ring; + boolean_t ds_polling; /* SL */ + boolean_t ds_direct; /* SL */ /* * LSO is enabled if ds_lso is set. */ - boolean_t ds_lso; - uint64_t ds_lso_max; + boolean_t ds_lso; /* SL */ + uint64_t ds_lso_max; /* SL */ /* * State of DLPI user: may be active (regular network layer), * passive (snoop-like monitoring), or unknown (not yet * determined). */ - dld_passivestate_t ds_passivestate; + dld_passivestate_t ds_passivestate; /* SL */ /* * Dummy mblk used for flow-control. */ - mblk_t *ds_tx_flow_mp; - - /* - * Internal transmit queue and its parameters. - */ - kmutex_t ds_tx_list_lock; - mblk_t *ds_tx_list_head; - mblk_t *ds_tx_list_tail; - uint_t ds_tx_cnt; - uint_t ds_tx_msgcnt; - timeout_id_t ds_tx_qdepth_tid; - boolean_t ds_tx_qbusy; - - dld_tx_t ds_tx; - dld_tx_t ds_unitdata_tx; - kmutex_t ds_tx_lock; - kcondvar_t ds_tx_cv; - uint32_t ds_intx_cnt; - boolean_t ds_detaching; - - /* - * Pending control messages to be processed. - */ - mblk_t *ds_pending_head; - mblk_t *ds_pending_tail; - - taskqid_t ds_tid; - kmutex_t ds_disp_lock; - kcondvar_t ds_disp_cv; - boolean_t ds_closing; + mblk_t *ds_tx_flow_mp; /* ds_lock */ /* - * Used to process ioctl message for control node. See comments - * above dld_ioctl(). + * List of queued DLPI requests. These will be processed + * by a taskq thread. 
This block is protected by ds_lock */ - void (*ds_ioctl)(queue_t *, mblk_t *); + kmutex_t ds_lock; + krwlock_t ds_rw_lock; + kcondvar_t ds_datathr_cv; /* ds_lock */ + uint_t ds_datathr_cnt; /* ds_lock */ + mblk_t *ds_pending_head; /* ds_lock */ + mblk_t *ds_pending_tail; /* ds_lock */ + kcondvar_t ds_dlpi_pending_cv; /* ds_lock */ + uint32_t + ds_dlpi_pending : 1, /* ds_lock */ + ds_local : 1, + ds_pad : 30; /* ds_lock */ + + dls_link_t *ds_dlp; /* SL */ + dls_multicst_addr_t *ds_dmap; /* ds_rw_lock */ + dls_rx_t ds_rx; /* ds_lock */ + void *ds_rx_arg; /* ds_lock */ + boolean_t ds_active; /* SL */ + dld_str_t *ds_next; /* SL */ + dls_head_t *ds_head; + dls_dl_handle_t ds_ddh; + list_node_t ds_tqlist; }; -#define DLD_TX_ENTER(dsp) { \ - mutex_enter(&(dsp)->ds_tx_lock); \ - (dsp)->ds_intx_cnt++; \ - mutex_exit(&(dsp)->ds_tx_lock); \ -} - -#define DLD_TX_EXIT(dsp) { \ - mutex_enter(&(dsp)->ds_tx_lock); \ - if ((--(dsp)->ds_intx_cnt == 0) && (dsp)->ds_detaching) \ - cv_signal(&(dsp)->ds_tx_cv); \ - mutex_exit(&(dsp)->ds_tx_lock); \ +#define DLD_DATATHR_INC(dsp) { \ + ASSERT(MUTEX_HELD(&(dsp)->ds_lock)); \ + dsp->ds_datathr_cnt++; \ } -/* - * Quiesce the traffic. - */ -#define DLD_TX_QUIESCE(dsp) { \ - mutex_enter(&(dsp)->ds_tx_lock); \ - (dsp)->ds_tx = (dsp)->ds_unitdata_tx = NULL; \ - (dsp)->ds_detaching = B_TRUE; \ - while ((dsp)->ds_intx_cnt != 0) \ - cv_wait(&(dsp)->ds_tx_cv, &(dsp)->ds_tx_lock); \ - (dsp)->ds_detaching = B_FALSE; \ - mutex_exit(&(dsp)->ds_tx_lock); \ +#define DLD_DATATHR_DCR(dsp) { \ + mutex_enter(&(dsp)->ds_lock); \ + (dsp)->ds_datathr_cnt--; \ + if ((dsp)->ds_datathr_cnt == 0) \ + cv_broadcast(&(dsp)->ds_datathr_cv); \ + mutex_exit(&(dsp)->ds_lock); \ } /* @@ -269,26 +244,34 @@ extern void dld_str_rx_fastpath(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); - -extern void dld_tx_flush(dld_str_t *); extern void dld_str_notify_ind(dld_str_t *); -extern void dld_tx_single(dld_str_t *, mblk_t *); -extern void str_mdata_fastpath_put(dld_str_t *, mblk_t *); -extern void str_mdata_raw_put(dld_str_t *, mblk_t *); - -extern void dld_ioctl(queue_t *, mblk_t *); -extern void dld_finish_pending_task(dld_str_t *); +extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); +extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, + int (*func)(), void *); /* * dld_proto.c */ -extern void dld_wput_proto_nondata(dld_str_t *, mblk_t *); -extern void dld_wput_proto_data(dld_str_t *, mblk_t *); +extern void dld_proto(dld_str_t *, mblk_t *); +extern void dld_proto_unitdata_req(dld_str_t *, mblk_t *); extern void dld_capabilities_disable(dld_str_t *); +extern void proto_unitdata_req(dld_str_t *, mblk_t *); + +/* + * dld_flow.c + */ +extern void flow_rx_pkt_chain(void *, void *, mblk_t *); + +/* + * dld_drv.c + */ +extern mac_handle_t dld_mac_open(char *dev_name, int *err); +#define dld_mac_close(mh) mac_close(mh) /* * Options: there should be a separate bit defined here for each - * DLD_PROP... defined in dld.h. + * DLD_PROP... defined in dld.h. 
*/ #define DLD_OPT_NO_FASTPATH 0x00000001 #define DLD_OPT_NO_POLL 0x00000002 @@ -316,6 +299,33 @@ typedef struct dld_ap { #define IMPLY(p, c) (!(p) || (c)) +#define DLD_SETQFULL(dsp) { \ + queue_t *q = (dsp)->ds_wq; \ + \ + mutex_enter(&(dsp)->ds_lock); \ + if ((dsp)->ds_tx_flow_mp != NULL) { \ + (void) putq(q, (dsp)->ds_tx_flow_mp); \ + (dsp)->ds_tx_flow_mp = NULL; \ + qenable((dsp)->ds_wq); \ + } \ + mutex_exit(&(dsp)->ds_lock); \ +} + +#define DLD_CLRQFULL(dsp) { \ + queue_t *q = (dsp)->ds_wq; \ + \ + mutex_enter(&(dsp)->ds_lock); \ + if (!mac_tx_is_flow_blocked((dsp)->ds_mch, NULL)) { \ + if ((dsp)->ds_tx_flow_mp == NULL) \ + (dsp)->ds_tx_flow_mp = getq(q); \ + ASSERT((dsp)->ds_tx_flow_mp != NULL); \ + } \ + mutex_exit(&(dsp)->ds_lock); \ +} + +#define DLD_TX(dsp, mp, f_hint, flag) \ + mac_tx(dsp->ds_mch, mp, f_hint, flag, NULL) + #ifdef DEBUG #define DLD_DBG cmn_err #else diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index cb8f5bf225..86406cab4f 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -77,18 +77,22 @@ extern "C" { * DLDCOPYIN or DLDCOPYOUT flags are set so that every di_func() * callback function does not need to copyin/out its own data. */ -typedef int (dld_ioc_func_t)(void *, intptr_t, int, cred_t *); + +/* Maximum number of Privileges */ +#define DLD_MAX_PRIV 16 + +typedef int (dld_ioc_func_t)(void *, intptr_t, int, cred_t *, int *); typedef struct dld_ioc_info { uint_t di_cmd; uint_t di_flags; size_t di_argsize; dld_ioc_func_t *di_func; + const char *di_priv[DLD_MAX_PRIV]; } dld_ioc_info_t; /* Values for di_flags */ #define DLDCOPYIN 0x00000001 /* copyin di_argsize amount of data */ #define DLDCOPYOUT 0x00000002 /* copyout di_argsize amount of data */ -#define DLDDLCONFIG 0x00000004 /* ioctl requires PRIV_SYS_DL_CONFIG */ #define DLDCOPYINOUT (DLDCOPYIN | DLDCOPYOUT) #define DLDIOCCNT(l) (sizeof (l) / sizeof (dld_ioc_info_t)) diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 3af7b7bca7..aa01ddeed6 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -586,12 +586,8 @@ union DL_qos_types { /* dl_data is dl_capab_mdt_t */ #define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */ /* dl_data is dl_capab_zerocopy_t */ -#define DL_CAPAB_POLL 0x06 /* Polling capability */ - /* dl_data is dl_capab_dls_t */ -#define DL_CAPAB_SOFT_RING 0x07 /* Soft ring capable */ - /* dl_data is dl_capab_dls_t */ -#define DL_CAPAB_LSO 0x08 /* Large Send Offload capability */ - /* dl_data is dl_capab_lso_t */ +#define DL_CAPAB_DLD 0x06 /* dld capability */ + /* dl_data is dl_capab_dld_t */ typedef struct { t_uscalar_t dl_cap; /* capability type */ @@ -710,55 +706,22 @@ typedef struct { #ifdef _KERNEL /* - * This structure is used by DL_CAPAB_POLL and DL_CAPAB_SOFT_RING - * capabilities. It provides a mechanism for IP to exchange function - * pointers with a gldv3-based driver to enable it to bypass streams- - * data-paths. DL_CAPAB_POLL mechanism provides a way to blank - * interrupts. Note: True polling support will be added in the future. - * DL_CAPAB_SOFT_RING provides a mechanism to create soft ring at the - * dls layer. + * The DL_CAPAB_DLD capability enables the capabilities of gldv3-based drivers + * to be negotiated using a function call (dld_capab) instead of using streams. 
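+ *
+ * E.g. once DL_CAPAB_DLD has been acknowledged, IP might extract the
+ * entry point and handle from the dl_capab_dld_t below and use them
+ * for all subsequent capability negotiation. In this sketch, capab
+ * points at the structure received in the acknowledgement and direct
+ * is a dld_capab_direct_t filled in by the caller (see <sys/dld.h>):
+ *
+ *	dld_capab_func_t func;
+ *	void *handle;
+ *
+ *	func = (dld_capab_func_t)capab->dld_capab;
+ *	handle = (void *)capab->dld_capab_handle;
+ *	rc = func(handle, DLD_CAPAB_DIRECT, &direct, DLD_ENABLE);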
*/ -typedef struct dl_capab_dls_s { - t_uscalar_t dls_version; - t_uscalar_t dls_flags; +typedef struct dl_capab_dld_s { + t_uscalar_t dld_version; + t_uscalar_t dld_flags; /* DLD provided information */ - uintptr_t dls_tx_handle; - uintptr_t dls_tx; - uintptr_t dls_ring_change_status; - uintptr_t dls_ring_bind; - uintptr_t dls_ring_unbind; + uintptr_t dld_capab; + uintptr_t dld_capab_handle; + dl_mid_t dld_mid; /* module ID */ +} dl_capab_dld_t; - /* IP provided information */ - uintptr_t dls_rx_handle; - uintptr_t dls_ring_assign; - uintptr_t dls_rx; - uintptr_t dls_ring_add; - t_uscalar_t dls_ring_cnt; - - dl_mid_t dls_mid; /* module ID */ -} dl_capab_dls_t; - -#define POLL_CURRENT_VERSION 0x01 -#define POLL_VERSION_1 0x01 - -#define SOFT_RING_VERSION_1 0x01 - -/* Values for poll_flags */ -#define POLL_ENABLE 0x01 /* Set to enable polling */ - /* capability */ -#define POLL_CAPABLE 0x02 /* Polling ability exists */ -#define POLL_DISABLE 0x03 /* Disable Polling */ - -/* Values for soft_ring_flags */ -#define SOFT_RING_ENABLE 0x04 /* Set to enable soft_ring */ - /* capability */ -#define SOFT_RING_CAPABLE 0x05 /* Soft_Ring ability exists */ -#define SOFT_RING_DISABLE 0x06 /* Disable Soft_Ring */ - -/* Soft_Ring fanout types (used by soft_ring_change_status) */ -#define SOFT_RING_NONE 0x00 -#define SOFT_RING_FANOUT 0x01 +#define DL_CAPAB_DLD_ENABLE 0x00000001 +#define DLD_VERSION_1 1 +#define DLD_CURRENT_VERSION DLD_VERSION_1 #endif /* _KERNEL */ @@ -786,29 +749,6 @@ typedef struct { /* transmit */ /* - * Large Send Offload sub-capability (follows dl_capability_sub_t) - */ -typedef struct { - t_uscalar_t lso_version; /* interface version */ - t_uscalar_t lso_flags; /* capability flags */ - t_uscalar_t lso_max; /* maximum payload */ - t_uscalar_t reserved[1]; /* reserved fields */ - dl_mid_t lso_mid; /* module ID */ -} dl_capab_lso_t; - -/* - * Large Send Offload revision definition history - */ -#define LSO_CURRENT_VERSION 0x01 -#define LSO_VERSION_1 0x01 - -/* - * Currently supported values of lso_flags - */ -#define LSO_TX_ENABLE 0x01 /* to enable LSO */ -#define LSO_TX_BASIC_TCP_IPV4 0x02 /* TCP LSO capability */ - -/* * DLPI interface primitive definitions. * * Each primitive is sent as a stream message. It is possible that diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 3bfe25ecf0..c96c6f1b85 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -28,8 +28,8 @@ #include <sys/types.h> #include <sys/stream.h> -#include <net/if.h> -#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/dls_mgmt.h> /* * Data-Link Services Module @@ -53,233 +53,56 @@ extern "C" { * Macros for converting ppas to instance #s, Vlan ID, or minor. */ #define DLS_PPA2INST(ppa) ((int)((ppa) % 1000)) -#define DLS_PPA2VID(ppa) ((ppa) / 1000) +#define DLS_PPA2VID(ppa) ((uint16_t)((ppa) / 1000)) +#define DLS_PPA2MINOR(ppa) ((minor_t)((DLS_PPA2INST(ppa)) + 1)) /* - * Converts a minor to an instance#; makes sense only when minor <= 1000. - */ -#define DLS_MINOR2INST(minor) ((int)((minor) - 1)) - -typedef enum { - DATALINK_CLASS_PHYS = 0x01, - DATALINK_CLASS_VLAN = 0x02, - DATALINK_CLASS_AGGR = 0x04, - DATALINK_CLASS_VNIC = 0x08 -} datalink_class_t; - -#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ - DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC) - -/* - * A combination of flags and media. 
- * flags is the higher 32 bits, and if it is 0x01, it indicates all media - * types can be accepted; otherwise, only the given media type (specified - * in the lower 32 bits) is accepted. + * Maps a (VID, INST) pair to ppa */ -typedef uint64_t datalink_media_t; - -#define DATALINK_ANY_MEDIATYPE \ - ((datalink_media_t)(((datalink_media_t)0x01) << 32)) - -#define DATALINK_MEDIA_ACCEPTED(dmedia, media) \ - (((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \ - B_TRUE : ((uint32_t)((dmedia) & 0xfffffffful) == (media))) - -#define MAXLINKATTRLEN 32 -#define MAXLINKATTRVALLEN 1024 +#define DLS_VIDINST2PPA(vid, inst) ((minor_t)((vid) * 1000 + (inst))) /* - * Link attributes used by the kernel. - */ -/* - * The major number and instance number of the underlying physical device - * are kept as FPHYMAJ and FPHYINST (major, instance + 1). - * - * Set for physical links only. - */ -#define FPHYMAJ "phymaj" /* uint64_t */ -#define FPHYINST "phyinst" /* uint64_t */ - -/* - * The devname of the physical link. For example, bge0, ce1. Set for physical - * links only. - */ -#define FDEVNAME "devname" /* string */ - -/* - * The door file for the dlmgmtd (data-link management) daemon. - */ -#define DLMGMT_DOOR "/etc/svc/volatile/dladm/dlmgmt_door" - -/* - * Door upcall commands. - */ -#define DLMGMT_CMD_DLS_CREATE 1 -#define DLMGMT_CMD_DLS_GETATTR 2 -#define DLMGMT_CMD_DLS_DESTROY 3 -#define DLMGMT_CMD_GETNAME 4 -#define DLMGMT_CMD_GETLINKID 5 -#define DLMGMT_CMD_GETNEXT 6 -#define DLMGMT_CMD_DLS_UPDATE 7 -#define DLMGMT_CMD_LINKPROP_INIT 8 -#define DLMGMT_CMD_BASE 128 - -/* - * Indicate the link mapping is active or persistent - */ -#define DLMGMT_ACTIVE 0x01 -#define DLMGMT_PERSIST 0x02 - -/* upcall argument */ -typedef struct dlmgmt_door_arg { - uint_t ld_cmd; -} dlmgmt_door_arg_t; - -typedef struct dlmgmt_upcall_arg_create { - int ld_cmd; - datalink_class_t ld_class; - uint32_t ld_media; - boolean_t ld_persist; - uint64_t ld_phymaj; - uint64_t ld_phyinst; - char ld_devname[MAXNAMELEN]; -} dlmgmt_upcall_arg_create_t; - -/* - * Note: ld_padding is necessary to keep the size of the structure the - * same on amd64 and i386. The same note applies to other ld_padding - * and lr_paddding fields in structures throughout this file. + * Converts a minor to an instance#; makes sense only when minor <= 1000. 
*/ -typedef struct dlmgmt_upcall_arg_destroy { - int ld_cmd; - datalink_id_t ld_linkid; - boolean_t ld_persist; - int ld_padding; -} dlmgmt_upcall_arg_destroy_t; - -typedef struct dlmgmt_upcall_arg_update { - int ld_cmd; - boolean_t ld_novanity; - uint32_t ld_media; - uint32_t ld_padding; - char ld_devname[MAXNAMELEN]; -} dlmgmt_upcall_arg_update_t; - -typedef struct dlmgmt_upcall_arg_getattr { - int ld_cmd; - datalink_id_t ld_linkid; - char ld_attr[MAXLINKATTRLEN]; -} dlmgmt_upcall_arg_getattr_t; - -typedef struct dlmgmt_door_getname { - int ld_cmd; - datalink_id_t ld_linkid; -} dlmgmt_door_getname_t; - -typedef struct dlmgmt_door_getlinkid { - int ld_cmd; - char ld_link[MAXLINKNAMELEN]; -} dlmgmt_door_getlinkid_t; - -typedef struct dlmgmt_door_getnext_s { - int ld_cmd; - datalink_id_t ld_linkid; - datalink_class_t ld_class; - uint32_t ld_flags; - datalink_media_t ld_dmedia; -} dlmgmt_door_getnext_t; - -typedef struct dlmgmt_door_linkprop_init { - int ld_cmd; - datalink_id_t ld_linkid; -} dlmgmt_door_linkprop_init_t; - -/* upcall return value */ -typedef struct dlmgmt_retval_s { - uint_t lr_err; /* return error code */ -} dlmgmt_retval_t; - -typedef dlmgmt_retval_t dlmgmt_destroy_retval_t, - dlmgmt_linkprop_init_retval_t; - -struct dlmgmt_linkid_retval_s { - uint_t lr_err; - datalink_id_t lr_linkid; - uint32_t lr_flags; - datalink_class_t lr_class; - uint32_t lr_media; - uint32_t lr_padding; -}; - -typedef struct dlmgmt_linkid_retval_s dlmgmt_create_retval_t, - dlmgmt_update_retval_t, - dlmgmt_getlinkid_retval_t, - dlmgmt_getnext_retval_t; - -typedef struct dlmgmt_getname_retval_s { - uint_t lr_err; - char lr_link[MAXLINKNAMELEN]; - datalink_class_t lr_class; - uint32_t lr_media; - uint32_t lr_flags; -} dlmgmt_getname_retval_t; - -typedef struct dlmgmt_getattr_retval_s { - uint_t lr_err; - uint_t lr_type; - uint_t lr_attrsz; - uint_t lr_padding; - char lr_attrval[MAXLINKATTRVALLEN]; -} dlmgmt_getattr_retval_t; +#define DLS_MINOR2INST(minor) ((int)((minor) - 1)) #ifdef _KERNEL #define DLS_MAX_PPA 999 #define DLS_MAX_MINOR (DLS_MAX_PPA + 1) -typedef struct dls_t *dls_channel_t; +typedef void (*dls_rx_t)(void *, mac_resource_handle_t, mblk_t *, + mac_header_info_t *); -extern int dls_open_style2_vlan(major_t, uint_t, dls_channel_t *); -extern int dls_open_by_dev(dev_t, dls_channel_t *); -extern void dls_close(dls_channel_t); - -extern mac_handle_t dls_mac(dls_channel_t); -extern uint16_t dls_vid(dls_channel_t); +typedef struct dld_str_s dld_str_t; +typedef struct dls_devnet_s *dls_dl_handle_t; +typedef struct dls_dev_t *dls_dev_handle_t; +typedef struct dls_link_s dls_link_t; #define DLS_SAP_LLC 0 #define DLS_SAP_PROMISC (1 << 16) -extern int dls_bind(dls_channel_t, uint32_t); -extern void dls_unbind(dls_channel_t); - #define DLS_PROMISC_SAP 0x00000001 #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 -extern int dls_promisc(dls_channel_t, uint32_t); - -extern int dls_multicst_add(dls_channel_t, const uint8_t *); -extern int dls_multicst_remove(dls_channel_t, const uint8_t *); - -extern mblk_t *dls_header(dls_channel_t, const uint8_t *, - uint16_t, uint_t, mblk_t **); -extern int dls_header_info(dls_channel_t, mblk_t *, - mac_header_info_t *); +extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); +extern void dls_close(dld_str_t *); +extern int dls_bind(dld_str_t *, uint32_t); +extern int dls_unbind(dld_str_t *); -typedef void (*dls_rx_t)(void *, mac_resource_handle_t, mblk_t *, - mac_header_info_t *); +extern int dls_promisc(dld_str_t *, uint32_t); -extern 
void dls_rx_set(dls_channel_t, dls_rx_t, void *); +extern int dls_multicst_add(dld_str_t *, const uint8_t *); +extern int dls_multicst_remove(dld_str_t *, const uint8_t *); -extern mblk_t *dls_tx(dls_channel_t, mblk_t *); +extern mblk_t *dls_header(dld_str_t *, const uint8_t *, + uint16_t, uint_t, mblk_t **); -extern boolean_t dls_active_set(dls_channel_t); -extern void dls_active_clear(dls_channel_t); +extern void dls_rx_set(dld_str_t *, dls_rx_t, void *); +extern dld_str_t *dls_rx_get(char *, flow_desc_t *, size_t *); -extern dev_info_t *dls_finddevinfo(dev_t); - -typedef struct dls_devnet_s *dls_dl_handle_t; -typedef struct dls_dev_t *dls_dev_handle_t; +extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); @@ -289,19 +112,18 @@ extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, const char *); extern int dls_devnet_create(mac_handle_t, datalink_id_t); -extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *); +extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, + boolean_t); extern int dls_devnet_recreate(mac_handle_t, datalink_id_t); -extern int dls_devnet_create_vlan(datalink_id_t, - datalink_id_t, uint16_t, boolean_t); -extern int dls_devnet_destroy_vlan(datalink_id_t); extern int dls_devnet_hold_tmp(datalink_id_t, dls_dl_handle_t *); extern void dls_devnet_rele_tmp(dls_dl_handle_t); +extern int dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *); +extern void dls_devnet_rele(dls_dl_handle_t); extern void dls_devnet_prop_task_wait(dls_dl_handle_t); extern const char *dls_devnet_mac(dls_dl_handle_t); extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); -extern boolean_t dls_devnet_is_explicit(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); extern int dls_devnet_setzid(const char *, zoneid_t); @@ -318,6 +140,8 @@ extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); +extern int dls_devnet_macname2linkid(const char *, + datalink_id_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 83bccd20bb..71f79a611a 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -26,174 +26,97 @@ #ifndef _SYS_DLS_IMPL_H #define _SYS_DLS_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/stream.h> #include <sys/dls.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/modhash.h> #include <sys/kstat.h> #include <net/if.h> #include <sys/dlpi.h> -#include <sys/dls_soft_ring.h> #ifdef __cplusplus extern "C" { #endif -typedef struct dls_multicst_addr_s dls_multicst_addr_t; - -struct dls_multicst_addr_s { - dls_multicst_addr_t *dma_nextp; - uint8_t dma_addr[MAXMACADDRLEN]; -}; - -typedef struct dls_link_s dls_link_t; - -struct dls_link_s { - char dl_name[MAXNAMELEN]; - mac_handle_t dl_mh; - const mac_info_t *dl_mip; - mac_rx_handle_t dl_mrh; - mac_txloop_handle_t dl_mth; - uint_t dl_ref; - uint_t dl_macref; - mod_hash_t *dl_impl_hash; - krwlock_t dl_impl_lock; - uint_t dl_impl_count; - kmutex_t dl_promisc_lock; - uint_t dl_npromisc; - uint_t dl_nactive; - uint32_t dl_unknowns; - kmutex_t dl_lock; +typedef struct 
dls_multicst_addr_s { + struct dls_multicst_addr_s *dma_nextp; /* ds_rw_lock */ + uint8_t dma_addr[MAXMACADDRLEN]; +} dls_multicst_addr_t; + +struct dls_link_s { /* Protected by */ + char dl_name[MAXNAMELEN]; /* SL */ + uint_t dl_ddi_instance; /* SL */ + mac_handle_t dl_mh; /* SL */ + mac_client_handle_t dl_mch; /* SL */ + mac_unicast_handle_t dl_mah; /* SL */ + const mac_info_t *dl_mip; /* SL */ + uint_t dl_ref; /* SL */ + mod_hash_t *dl_str_hash; /* SL, modhash lock */ + uint_t dl_impl_count; /* SL */ + uint_t dl_nactive; /* SL */ + uint32_t dl_unknowns; /* atomic */ + zoneid_t dl_zid; + uint_t dl_zone_ref; }; -typedef struct dls_impl_s dls_impl_t; -typedef struct dls_head_s dls_head_t; - -/* - * The maximum length of an SPA (subnetwork point of attachment). It is of - * the form <macname/vid>. - */ -#define MAXSPALEN (MAXNAMELEN + 5) - -typedef struct dls_vlan_s { - /* - * The following fields will not change after dls_vlan_t creation. - */ - dls_link_t *dv_dlp; - uint16_t dv_id; - - /* - * Unique SPA (of the form <macname/vid>) identifying a data-link; - * is needed to avoid name collisions between an explicitly and - * implicitly created VLANs. - */ - char dv_spa[MAXSPALEN]; - - /* - * The ppa value of the associated device. Used to derive this link's - * devfs node name. - */ - uint_t dv_ppa; - - /* - * The dev_t used to access this dls_vlan_t. - */ - dev_t dv_dev; - - dev_info_t *dv_dip; - kstat_t *dv_ksp; - uint32_t dv_force : 1; - - /* - * The following fields are protected by dv_lock. - */ - kmutex_t dv_lock; - - /* - * Reference count of dls_impl_t plus explicit creation of the link - */ - uint_t dv_ref; - - /* - * The reference count of this vlan is opened in its own zone. - */ - uint_t dv_zone_ref; - zoneid_t dv_zid; -} dls_vlan_t; - -struct dls_impl_s { - dls_impl_t *di_nextp; - dls_head_t *di_headp; - dls_vlan_t *di_dvp; - mac_handle_t di_mh; - mac_notify_handle_t di_mnh; - const mac_info_t *di_mip; - krwlock_t di_lock; - uint16_t di_sap; - uint_t di_promisc; - dls_multicst_addr_t *di_dmap; - dls_rx_t di_rx; - void *di_rx_arg; - mac_resource_add_t di_ring_add; - const mac_txinfo_t *di_txinfo; - uint_t di_bound : 1, - di_removing : 1, - di_active : 1, - di_local : 1; - - uint8_t di_unicst_addr[MAXMACADDRLEN]; - soft_ring_t **di_soft_ring_list; - uint_t di_soft_ring_size; - dls_dl_handle_t di_ddh; -}; - -struct dls_head_s { - dls_impl_t *dh_list; - uint_t dh_ref; - mod_hash_key_t dh_key; -}; +typedef struct dls_head_s { + kmutex_t dh_lock; + struct dld_str_s *dh_list; /* dh_ref */ + uint_t dh_ref; /* dh_lock */ + mod_hash_key_t dh_key; /* SL */ + kcondvar_t dh_cv; /* dh_lock */ + uint_t dh_removing; /* dh_lock */ +} dls_head_t; extern void dls_link_init(void); extern int dls_link_fini(void); extern int dls_link_hold(const char *, dls_link_t **); +extern int dls_link_hold_create(const char *, dls_link_t **); +extern int dls_link_hold_by_dev(dev_t, dls_link_t **); extern void dls_link_rele(dls_link_t *); -extern void dls_link_add(dls_link_t *, uint32_t, dls_impl_t *); -extern void dls_link_remove(dls_link_t *, dls_impl_t *); +extern int dls_link_rele_by_name(const char *); +extern void dls_link_add(dls_link_t *, uint32_t, dld_str_t *); +extern void dls_link_remove(dls_link_t *, dld_str_t *); extern int dls_link_header_info(dls_link_t *, mblk_t *, mac_header_info_t *); -extern int dls_mac_hold(dls_link_t *); -extern void dls_mac_rele(dls_link_t *); -extern boolean_t dls_mac_active_set(dls_link_t *); -extern void dls_mac_active_clear(dls_link_t *); +extern int 
dls_link_setzid(const char *, zoneid_t); +extern dev_info_t *dls_link_devinfo(dev_t); +extern dev_t dls_link_dev(dls_link_t *); -extern void dls_mac_stat_create(dls_vlan_t *); -extern void dls_mac_stat_destroy(dls_vlan_t *); +extern void i_dls_head_rele(dls_head_t *); +extern int dls_mac_active_set(dls_link_t *i); +extern void dls_mac_active_clear(dls_link_t *); -extern void dls_vlan_init(void); -extern int dls_vlan_fini(void); -extern int dls_vlan_hold(const char *, uint16_t, dls_vlan_t **, - boolean_t, boolean_t); -extern int dls_vlan_hold_by_dev(dev_t, dls_vlan_t **); -extern void dls_vlan_rele(dls_vlan_t *); -extern int dls_vlan_destroy(const char *, uint16_t); -extern int dls_vlan_create(const char *, uint16_t, boolean_t); -extern int dls_vlan_setzid(const char *, uint16_t, zoneid_t); -extern int dls_stat_update(kstat_t *, dls_vlan_t *, int); +extern void dls_create_str_kstats(dld_str_t *); +extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, int (*)(struct kstat *, int), void *, kstat_t **); -extern int dls_devnet_open_by_dev(dev_t, dls_vlan_t **, +extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); +extern int dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *, + dls_link_t **); +extern void dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *); extern void dls_init(void); extern int dls_fini(void); extern void dls_link_txloop(void *, mblk_t *); -extern boolean_t dls_accept(dls_impl_t *, mac_header_info_t *, +extern boolean_t dls_accept(dld_str_t *, mac_header_info_t *, dls_rx_t *, void **); -extern boolean_t dls_accept_loopback(dls_impl_t *, mac_header_info_t *, +extern boolean_t dls_accept_loopback(dld_str_t *, mac_header_info_t *, dls_rx_t *, void **); +extern boolean_t dls_accept_promisc(dld_str_t *, mac_header_info_t *, + dls_rx_t *, void **, boolean_t); +extern void i_dls_link_rx(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +extern void dls_rx_promisc(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +extern void dls_rx_vlan_promisc(void *, mac_resource_handle_t, + mblk_t *, boolean_t); +extern int dls_active_set(dld_str_t *); +extern void dls_active_clear(dld_str_t *); extern void dls_mgmt_init(void); extern void dls_mgmt_fini(void); diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h new file mode 100644 index 0000000000..5177de09b9 --- /dev/null +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -0,0 +1,218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _DLS_MGMT_H +#define _DLS_MGMT_H + +#include <sys/types.h> +#include <sys/dld.h> + +/* + * Data-Link Services Module + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + DATALINK_CLASS_PHYS = 0x01, + DATALINK_CLASS_VLAN = 0x02, + DATALINK_CLASS_AGGR = 0x04, + DATALINK_CLASS_VNIC = 0x08, + DATALINK_CLASS_ETHERSTUB = 0x10 +} datalink_class_t; + +#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ + DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ + DATALINK_CLASS_ETHERSTUB) + +/* + * A combination of flags and media. + * flags is the higher 32 bits, and if it is 0x01, it indicates all media + * types can be accepted; otherwise, only the given media type (specified + * in the lower 32 bits) is accepted. + */ +typedef uint64_t datalink_media_t; + +#define DATALINK_ANY_MEDIATYPE \ + ((datalink_media_t)(((datalink_media_t)0x01) << 32)) + +#define DATALINK_MEDIA_ACCEPTED(dmedia, media) \ + (((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \ + B_TRUE : ((uint32_t)((dmedia) & 0xfffffffful) == (media))) + +#define MAXLINKATTRLEN 32 +#define MAXLINKATTRVALLEN 1024 + +/* + * Link attributes used by the kernel. + */ +/* + * The major number and instance number of the underlying physical device + * are kept as FPHYMAJ and FPHYINST (major, instance + 1). + * + * Set for physical links only. + */ +#define FPHYMAJ "phymaj" /* uint64_t */ +#define FPHYINST "phyinst" /* uint64_t */ + +/* + * The devname of the physical link. For example, bge0, ce1. Set for physical + * links only. + */ +#define FDEVNAME "devname" /* string */ + +/* + * The door file for the dlmgmtd (data-link management) daemon. + */ +#define DLMGMT_DOOR "/etc/svc/volatile/dladm/dlmgmt_door" + +/* + * Door upcall commands. + */ +#define DLMGMT_CMD_DLS_CREATE 1 +#define DLMGMT_CMD_DLS_GETATTR 2 +#define DLMGMT_CMD_DLS_DESTROY 3 +#define DLMGMT_CMD_GETNAME 4 +#define DLMGMT_CMD_GETLINKID 5 +#define DLMGMT_CMD_GETNEXT 6 +#define DLMGMT_CMD_DLS_UPDATE 7 +#define DLMGMT_CMD_LINKPROP_INIT 8 +#define DLMGMT_CMD_BASE 128 + +/* + * Indicate the link mapping is active or persistent + */ +#define DLMGMT_ACTIVE 0x01 +#define DLMGMT_PERSIST 0x02 + +/* upcall argument */ +typedef struct dlmgmt_door_arg { + uint_t ld_cmd; +} dlmgmt_door_arg_t; + +typedef struct dlmgmt_upcall_arg_create { + int ld_cmd; + datalink_class_t ld_class; + uint32_t ld_media; + boolean_t ld_persist; + uint64_t ld_phymaj; + uint64_t ld_phyinst; + char ld_devname[MAXNAMELEN]; +} dlmgmt_upcall_arg_create_t; + +/* + * Note: ld_padding is necessary to keep the size of the structure the + * same on amd64 and i386. The same note applies to other ld_padding + * and lr_paddding fields in structures throughout this file. 
+ */ +typedef struct dlmgmt_upcall_arg_destroy { + int ld_cmd; + datalink_id_t ld_linkid; + boolean_t ld_persist; + int ld_padding; +} dlmgmt_upcall_arg_destroy_t; + +typedef struct dlmgmt_upcall_arg_update { + int ld_cmd; + boolean_t ld_novanity; + uint32_t ld_media; + uint32_t ld_padding; + char ld_devname[MAXNAMELEN]; +} dlmgmt_upcall_arg_update_t; + +typedef struct dlmgmt_upcall_arg_getattr { + int ld_cmd; + datalink_id_t ld_linkid; + char ld_attr[MAXLINKATTRLEN]; +} dlmgmt_upcall_arg_getattr_t; + +typedef struct dlmgmt_door_getname { + int ld_cmd; + datalink_id_t ld_linkid; +} dlmgmt_door_getname_t; + +typedef struct dlmgmt_door_getlinkid { + int ld_cmd; + char ld_link[MAXLINKNAMELEN]; +} dlmgmt_door_getlinkid_t; + +typedef struct dlmgmt_door_getnext_s { + int ld_cmd; + datalink_id_t ld_linkid; + datalink_class_t ld_class; + uint32_t ld_flags; + datalink_media_t ld_dmedia; +} dlmgmt_door_getnext_t; + +typedef struct dlmgmt_door_linkprop_init { + int ld_cmd; + datalink_id_t ld_linkid; +} dlmgmt_door_linkprop_init_t; + +/* upcall return value */ +typedef struct dlmgmt_retval_s { + uint_t lr_err; /* return error code */ +} dlmgmt_retval_t; + +typedef dlmgmt_retval_t dlmgmt_destroy_retval_t, + dlmgmt_linkprop_init_retval_t; + +struct dlmgmt_linkid_retval_s { + uint_t lr_err; + datalink_id_t lr_linkid; + uint32_t lr_flags; + datalink_class_t lr_class; + uint32_t lr_media; + uint32_t lr_padding; +}; + +typedef struct dlmgmt_linkid_retval_s dlmgmt_create_retval_t, + dlmgmt_update_retval_t, + dlmgmt_getlinkid_retval_t, + dlmgmt_getnext_retval_t; + +typedef struct dlmgmt_getname_retval_s { + uint_t lr_err; + char lr_link[MAXLINKNAMELEN]; + datalink_class_t lr_class; + uint32_t lr_media; + uint32_t lr_flags; +} dlmgmt_getname_retval_t; + +typedef struct dlmgmt_getattr_retval_s { + uint_t lr_err; + uint_t lr_type; + uint_t lr_attrsz; + uint_t lr_padding; + char lr_attrval[MAXLINKATTRVALLEN]; +} dlmgmt_getattr_retval_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _DLS_MGMT_H */ diff --git a/usr/src/uts/common/sys/dls_soft_ring.h b/usr/src/uts/common/sys/dls_soft_ring.h deleted file mode 100644 index 403623853a..0000000000 --- a/usr/src/uts/common/sys/dls_soft_ring.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_DLS_SOFT_RING_H -#define _SYS_DLS_SOFT_RING_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/types.h> -#include <sys/processor.h> -#include <sys/stream.h> -#include <sys/squeue.h> -#include <sys/mac.h> - -#define S_RING_NAMELEN 64 - -typedef void (*s_ring_proc_t)(void *, void *, mblk_t *, mac_header_info_t *); - -typedef struct soft_ring_s { - /* Keep the most used members 64bytes cache aligned */ - kmutex_t s_ring_lock; /* lock before using any member */ - uint16_t s_ring_type; /* processing model of the sq */ - uint16_t s_ring_state; /* state flags and message count */ - int s_ring_count; /* # of mblocks in soft_ring */ - mblk_t *s_ring_first; /* first mblk chain or NULL */ - mblk_t *s_ring_last; /* last mblk chain or NULL */ - s_ring_proc_t s_ring_upcall; /* Upcall func pointer */ - void *s_ring_upcall_arg1; /* upcall argument 1 */ - void *s_ring_upcall_arg2; /* upcall argument 2 */ - clock_t s_ring_awaken; /* time async thread was awakened */ - - kthread_t *s_ring_run; /* Current thread processing sq */ - processorid_t s_ring_bind; /* processor to bind to */ - kcondvar_t s_ring_async; /* async thread blocks on */ - clock_t s_ring_wait; /* lbolts to wait after a fill() */ - timeout_id_t s_ring_tid; /* timer id of pending timeout() */ - kthread_t *s_ring_worker; /* kernel thread id */ - char s_ring_name[S_RING_NAMELEN + 1]; - uint32_t s_ring_total_inpkt; -} soft_ring_t; - - -/* - * type flags - combination allowed to process and drain the queue - */ -#define S_RING_WORKER_ONLY 0x0001 /* Worker thread only */ -#define S_RING_ANY 0x0002 /* Any thread can process the queue */ - -/* - * State flags. - */ -#define S_RING_PROC 0x0001 /* being processed */ -#define S_RING_WORKER 0x0002 /* worker thread */ -#define S_RING_BOUND 0x0004 /* Worker thread is bound */ -#define S_RING_DESTROY 0x0008 /* Ring is being destroyed */ -#define S_RING_DEAD 0x0010 /* Worker thread is no more */ - -/* - * arguments for processors to bind to - */ -#define S_RING_BIND_NONE -1 - -/* - * Structure for dls statistics - */ -struct dls_kstats { - kstat_named_t dlss_soft_ring_pkt_drop; -}; - -extern struct dls_kstats dls_kstat; - -#define DLS_BUMP_STAT(x, y) (dls_kstat.x.value.ui32 += y) - -extern void soft_ring_init(void); -extern soft_ring_t *soft_ring_create(char *, processorid_t, clock_t, - uint_t, pri_t); -extern soft_ring_t **soft_ring_set_create(char *, processorid_t, clock_t, - uint_t, pri_t, int); -extern void soft_ring_set_destroy(soft_ring_t **, int); -extern void soft_ring_bind(void *, processorid_t); -extern void soft_ring_unbind(void *); -extern void dls_soft_ring_fanout(void *, void *, mblk_t *, mac_header_info_t *); -extern boolean_t dls_soft_ring_enable(dls_channel_t, dl_capab_dls_t *); -extern void dls_soft_ring_disable(dls_channel_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DLS_SOFT_RING_H */ diff --git a/usr/src/uts/common/sys/exacct.h b/usr/src/uts/common/sys/exacct.h index b30362bb05..a9c394bb4f 100644 --- a/usr/src/uts/common/sys/exacct.h +++ b/usr/src/uts/common/sys/exacct.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_EXACCT_H #define _SYS_EXACCT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/task.h> #include <sys/proc.h> @@ -175,6 +173,7 @@ extern int exacct_tag_task(ac_info_t *, task_t *, void *, size_t, int); extern int exacct_tag_proc(ac_info_t *, pid_t, taskid_t, void *, size_t, int, const char *); extern void exacct_commit_flow(void *); +extern int exacct_commit_netinfo(void *, int); extern void exacct_init(void); extern void *exacct_create_header(size_t *); extern int exacct_write_header(ac_info_t *, void *, size_t); @@ -192,6 +191,9 @@ extern int exacct_assemble_flow_usage(ac_info_t *, flow_usage_t *, int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *), void *, size_t, size_t *); extern void exacct_move_mstate(proc_t *, task_t *, task_t *); +extern int exacct_assemble_net_usage(ac_info_t *, void *, + int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *), + void *, size_t, size_t *, int); extern taskq_t *exacct_queue; extern kmem_cache_t *exacct_object_cache; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/exacct_catalog.h b/usr/src/uts/common/sys/exacct_catalog.h index 0911344382..f6d9c09e7a 100644 --- a/usr/src/uts/common/sys/exacct_catalog.h +++ b/usr/src/uts/common/sys/exacct_catalog.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_EXACCT_CATALOG_H #define _SYS_EXACCT_CATALOG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -104,6 +101,10 @@ extern "C" { #define EXD_GROUP_FLOW 0x000109 #define EXD_GROUP_RFMA 0x00010a #define EXD_GROUP_FMA 0x00010b +#define EXD_GROUP_NET_LINK_DESC 0X00010c +#define EXD_GROUP_NET_FLOW_DESC 0X00010d +#define EXD_GROUP_NET_LINK_STATS 0X00010e +#define EXD_GROUP_NET_FLOW_STATS 0X00010f #define EXD_PROC_PID 0x001000 #define EXD_PROC_UID 0x001001 @@ -204,6 +205,36 @@ extern "C" { #define EXD_FMA_OFFSET 0x00400B #define EXD_FMA_UUID 0x00400C +/* For EXD_GROUP_FLDESC and EXD_GROUP_LNDESC */ +#define EXD_NET_DESC_NAME 0x005001 +#define EXD_NET_DESC_EHOST 0x005002 +#define EXD_NET_DESC_EDEST 0x005003 +#define EXD_NET_DESC_VLAN_TPID 0x005004 +#define EXD_NET_DESC_VLAN_TCI 0x005005 +#define EXD_NET_DESC_SAP 0x005006 +#define EXD_NET_DESC_PRIORITY 0x005007 +#define EXD_NET_DESC_BWLIMIT 0x005008 +/* For EXD_GROUP_FLDESC only */ +#define EXD_NET_DESC_DEVNAME 0x005009 +#define EXD_NET_DESC_V4SADDR 0x00500a +#define EXD_NET_DESC_V4DADDR 0x00500b +#define EXD_NET_DESC_V6SADDR 0x00500c +#define EXD_NET_DESC_V6DADDR 0x00500d +#define EXD_NET_DESC_SPORT 0x00500e +#define EXD_NET_DESC_DPORT 0x00500f +#define EXD_NET_DESC_PROTOCOL 0x005010 +#define EXD_NET_DESC_DSFIELD 0x005011 + +/* For EXD_NET_STATS */ +#define EXD_NET_STATS_NAME 0x006000 +#define EXD_NET_STATS_CURTIME 0x006001 +#define EXD_NET_STATS_IBYTES 0x006002 +#define EXD_NET_STATS_OBYTES 0x006003 +#define EXD_NET_STATS_IPKTS 0x006004 +#define EXD_NET_STATS_OPKTS 0x006005 +#define EXD_NET_STATS_IERRPKTS 0x006006 +#define EXD_NET_STATS_OERRPKTS 0x006007 + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/exacct_impl.h b/usr/src/uts/common/sys/exacct_impl.h index 14cee43d5f..6f25f02e7e 100644 --- a/usr/src/uts/common/sys/exacct_impl.h +++ b/usr/src/uts/common/sys/exacct_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_EXACCT_IMPL_H #define _SYS_EXACCT_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -129,6 +126,42 @@ typedef struct flow_usage { char *fu_aname; /* action instance name */ } flow_usage_t; +#define EX_NET_LNDESC_REC 1 +#define EX_NET_FLDESC_REC 2 +#define EX_NET_LNSTAT_REC 3 +#define EX_NET_FLSTAT_REC 4 + +typedef struct net_stat_s { + char *ns_name; + uint64_t ns_ibytes; + uint64_t ns_obytes; + uint64_t ns_ipackets; + uint64_t ns_opackets; + uint64_t ns_ierrors; + uint64_t ns_oerrors; + boolean_t ns_isref; +} net_stat_t; + +typedef struct net_desc_s { + char *nd_name; + char *nd_devname; + uchar_t nd_ehost[6]; + uchar_t nd_edest[6]; + ushort_t nd_vlan_tpid; + ushort_t nd_vlan_tci; + ushort_t nd_sap; + ushort_t nd_priority; + uint64_t nd_bw_limit; + uint32_t nd_saddr[4]; + uint32_t nd_daddr[4]; + boolean_t nd_isv4; + uint16_t nd_sport; + uint16_t nd_dport; + uint8_t nd_protocol; + uint8_t nd_dsfield; + int nd_type; +} net_desc_t; + extern void exacct_order16(uint16_t *); extern void exacct_order32(uint32_t *); extern void exacct_order64(uint64_t *); diff --git a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h index 8cdf2cf96a..73419866a9 100644 --- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h +++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h @@ -26,8 +26,6 @@ #ifndef _SYS_IB_CLIENTS_IBD_H #define _SYS_IB_CLIENTS_IBD_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -96,7 +94,7 @@ typedef struct ipoib_pgrh { #include <sys/ib/ibtl/ibti.h> #include <sys/ib/ib_pkt_hdrs.h> #include <sys/list.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ib.h> #include <sys/modhash.h> diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 9011423727..d4608f3729 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -30,6 +31,7 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/stream.h> +#include <sys/mac_flow.h> /* * MAC Services Module @@ -42,13 +44,7 @@ extern "C" { /* * MAC Information (text emitted by modinfo(1m)) */ -#define MAC_INFO "MAC Services" - -/* - * MAC version identifier. This is used by mac_alloc() mac_register() to - * verify that incompatible drivers don't register. - */ -#define MAC_VERSION 0x1 +#define MAC_INFO "MAC Services v1.20" /* * MAC-Type version identifier. 
This is used by mactype_alloc() and @@ -58,17 +54,23 @@ extern "C" { #define MACTYPE_VERSION 0x1 /* - * Statistics + * Opaque handle types */ +typedef struct __mac_handle *mac_handle_t; +typedef struct __mac_resource_handle *mac_resource_handle_t; +typedef struct __mac_notify_handle *mac_notify_handle_t; +typedef struct __mac_tx_notify_handle *mac_tx_notify_handle_t; +typedef struct __mac_intr_handle *mac_intr_handle_t; +typedef struct __mac_ring_handle *mac_ring_handle_t; +typedef struct __mac_group_handle *mac_group_handle_t; -#define XCVR_UNDEFINED 0 -#define XCVR_NONE 1 -#define XCVR_10 2 -#define XCVR_100T4 3 -#define XCVR_100X 4 -#define XCVR_100T2 5 -#define XCVR_1000X 6 -#define XCVR_1000T 7 +#define DATALINK_INVALID_LINKID 0 +#define DATALINK_ALL_LINKID 0 +#define DATALINK_MAX_LINKID 0xffffffff + +#define MAC_MAX_MINOR 1000 + +typedef uint32_t datalink_id_t; typedef enum { LINK_STATE_UNKNOWN = -1, @@ -82,10 +84,6 @@ typedef enum { LINK_DUPLEX_FULL } link_duplex_t; -#define DATALINK_INVALID_LINKID 0 -#define DATALINK_ALL_LINKID 0 -#define DATALINK_MAX_LINKID 0xffffffff - typedef enum { LINK_FLOWCTRL_NONE = 0, LINK_FLOWCTRL_RX, @@ -93,7 +91,15 @@ typedef enum { LINK_FLOWCTRL_BI } link_flowctrl_t; -typedef uint32_t datalink_id_t; +/* + * Maximum MAC address length + */ +#define MAXMACADDRLEN 20 + +typedef enum { + MAC_LOGTYPE_LINK = 1, + MAC_LOGTYPE_FLOW +} mac_logtype_t; /* * Encodings for public properties. @@ -153,15 +159,13 @@ typedef enum { MAC_PROP_WL_DELKEY, MAC_PROP_WL_KEY, MAC_PROP_WL_MLME, + MAC_PROP_MAXBW, + MAC_PROP_PRIO, + MAC_PROP_BIND_CPU, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; /* - * Maximum MAC address length - */ -#define MAXMACADDRLEN 20 - -/* * Flags to figure out r/w status of legacy ndd props. */ #define MAC_PROP_PERM_READ 0x0001 @@ -172,13 +176,6 @@ typedef enum { #ifdef _KERNEL -typedef struct mac_stat_info_s { - uint_t msi_stat; - char *msi_name; - uint_t msi_type; /* as defined in kstat_named_init(9F) */ - uint64_t msi_default; -} mac_stat_info_t; - /* * There are three ranges of statistics values. 0 to 1 - MAC_STAT_MIN are * interface statistics maintained by the mac module. MAC_STAT_MIN to 1 - @@ -259,27 +256,6 @@ typedef struct mac_info_s { } mac_info_t; /* - * LSO capability - */ -typedef struct lso_basic_tcp_ipv4_s { - t_uscalar_t lso_max; /* maximum payload */ -} lso_basic_tcp_ipv4_t; - -/* - * Future LSO capabilities can be added at the end of the mac_capab_lso_t. - * When such capability is added to the GLDv3 framework, the size of the - * mac_capab_lso_t it allocates and passes to the drivers increases. Older - * drivers wil access only the (upper) sections of that structure, that is the - * sections carrying the capabilities they understand. This ensures the - * interface can be safely extended in a binary compatible way. - */ -typedef struct mac_capab_lso_s { - t_uscalar_t lso_flags; - lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4; - /* Add future lso capabilities here */ -} mac_capab_lso_t; - -/* * Information for legacy devices. */ typedef struct mac_capab_legacy_s { @@ -294,307 +270,32 @@ typedef struct mac_capab_legacy_s { } mac_capab_legacy_t; /* - * MAC layer capabilities. These capabilities are handled by the drivers' - * mc_capab_get() callbacks. Some capabilities require the driver to fill - * in a given data structure, and others are simply boolean capabilities. 
- * Note that capability values must be powers of 2 so that consumers and - * providers of this interface can keep track of which capabilities they - * care about by keeping a bitfield of these things around somewhere. - */ -typedef enum { - MAC_CAPAB_HCKSUM = 0x01, /* data is a uint32_t for the txflags */ - MAC_CAPAB_POLL = 0x02, /* boolean only, no data */ - MAC_CAPAB_MULTIADDRESS = 0x04, /* data is multiaddress_capab_t */ - MAC_CAPAB_LSO = 0x08, /* data is mac_capab_lso_t */ - MAC_CAPAB_NO_NATIVEVLAN = 0x10, /* boolean only, no data */ - MAC_CAPAB_NO_ZCOPY = 0x20, /* boolean only, no data */ - /* add new capabilities here */ - MAC_CAPAB_RINGS = 0x100, /* data is mac_capab_rings_t */ - MAC_CAPAB_SHARES = 0x200, /* data is mac_capab_share_t */ - - /* The following capabilities are specific to softmac. */ - MAC_CAPAB_LEGACY = 0x8000 /* data is mac_capab_legacy_t */ -} mac_capab_t; - -typedef int mac_addr_slot_t; - -/* mma_flags values */ -#define MMAC_SLOT_USED 0x1 /* address slot used */ -#define MMAC_SLOT_UNUSED 0x2 /* free address slot */ -#define MMAC_VENDOR_ADDR 0x4 /* address returned is vendor supplied */ - -typedef struct mac_multi_address_s { - mac_addr_slot_t mma_slot; /* slot for add/remove/get/set */ - uint_t mma_addrlen; - uint8_t mma_addr[MAXMACADDRLEN]; - uint_t mma_flags; -} mac_multi_addr_t; - -typedef int (*maddr_reserve_t)(void *, mac_multi_addr_t *); -typedef int (*maddr_add_t)(void *, mac_multi_addr_t *); -typedef int (*maddr_remove_t)(void *, mac_addr_slot_t); -typedef int (*maddr_modify_t)(void *, mac_multi_addr_t *); -typedef int (*maddr_get_t)(void *, mac_multi_addr_t *); - -/* maddr_flag values */ -#define MADDR_VENDOR_ADDR 0x01 /* addr returned is vendor supplied */ - -/* multiple mac address: add/remove/set/get mac address */ -typedef struct multiaddress_capab_s { - int maddr_naddr; /* total addresses */ - int maddr_naddrfree; /* free address slots */ - uint_t maddr_flag; /* MADDR_VENDOR_ADDR bit can be set */ - /* driver entry points */ - void *maddr_handle; /* cookie to be used for the calls */ - maddr_reserve_t maddr_reserve; /* reserve a factory address */ - maddr_add_t maddr_add; /* add a new unicst address */ - maddr_remove_t maddr_remove; /* remove an added address */ - maddr_modify_t maddr_modify; /* modify an added address */ - maddr_get_t maddr_get; /* get address from specified slot */ -} multiaddress_capab_t; - -/* - * MAC driver entry point types. - */ -typedef int (*mac_getstat_t)(void *, uint_t, uint64_t *); -typedef int (*mac_start_t)(void *); -typedef void (*mac_stop_t)(void *); -typedef int (*mac_setpromisc_t)(void *, boolean_t); -typedef int (*mac_multicst_t)(void *, boolean_t, const uint8_t *); -typedef int (*mac_unicst_t)(void *, const uint8_t *); -typedef void (*mac_ioctl_t)(void *, queue_t *, mblk_t *); -typedef void (*mac_resources_t)(void *); -typedef mblk_t *(*mac_tx_t)(void *, mblk_t *); -typedef boolean_t (*mac_getcapab_t)(void *, mac_capab_t, void *); -typedef int (*mac_open_t)(void *); -typedef void (*mac_close_t)(void *); -typedef int (*mac_set_prop_t)(void *, const char *, mac_prop_id_t, - uint_t, const void *); -typedef int (*mac_get_prop_t)(void *, const char *, mac_prop_id_t, - uint_t, uint_t, void *, uint_t *); - -/* - * Drivers must set all of these callbacks except for mc_resources, - * mc_ioctl, and mc_getcapab, which are optional. If any of these optional - * callbacks are set, their appropriate flags must be set in mc_callbacks. 
- * Any future additions to this list must also be accompanied by an - * associated mc_callbacks flag so that the framework can grow without - * affecting the binary compatibility of the interface. - */ -typedef struct mac_callbacks_s { - uint_t mc_callbacks; /* Denotes which callbacks are set */ - mac_getstat_t mc_getstat; /* Get the value of a statistic */ - mac_start_t mc_start; /* Start the device */ - mac_stop_t mc_stop; /* Stop the device */ - mac_setpromisc_t mc_setpromisc; /* Enable or disable promiscuous mode */ - mac_multicst_t mc_multicst; /* Enable or disable a multicast addr */ - mac_unicst_t mc_unicst; /* Set the unicast MAC address */ - mac_tx_t mc_tx; /* Transmit a packet */ - mac_resources_t mc_resources; /* Get the device resources */ - mac_ioctl_t mc_ioctl; /* Process an unknown ioctl */ - mac_getcapab_t mc_getcapab; /* Get capability information */ - mac_open_t mc_open; /* Open the device */ - mac_close_t mc_close; /* Close the device */ - mac_set_prop_t mc_setprop; - mac_get_prop_t mc_getprop; -} mac_callbacks_t; - -typedef struct mac_priv_prop_s { - char mpp_name[MAXLINKPROPNAME]; - uint_t mpp_flags; -} mac_priv_prop_t; - -/* - * Multiple Rings capability - */ -typedef enum { - MAC_RING_TYPE_RX = 1, /* Receive ring */ - MAC_RING_TYPE_TX = 2 /* Transmit ring */ -} mac_ring_type_t; - -/* - * Grouping type of a ring group + * When VNICs are created on top of the NIC, there are two levels + * of MAC layer, a lower MAC, which is the MAC layer at the level of the + * physical NIC, and an upper MAC, which is the MAC layer at the level + * of the VNIC. Each VNIC maps to a MAC client at the lower MAC, and + * the SRS and classification is done at the lower MAC level. The upper + * MAC is therefore for the most part pass-through, and therefore + * special processing needs to be done at the upper MAC layer when + * dealing with a VNIC. * - * MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped. - * MAC_GROUP_TYPE_DYNAMIC: The ring group support dynamic re-grouping - */ -typedef enum { - MAC_GROUP_TYPE_STATIC = 1, /* Static ring group */ - MAC_GROUP_TYPE_DYNAMIC = 2 /* Dynamic ring group */ -} mac_group_type_t; - -typedef struct __mac_ring_driver *mac_ring_driver_t; -typedef struct __mac_ring_handle *mac_ring_handle_t; -typedef struct __mac_group_driver *mac_group_driver_t; -typedef struct __mac_group_handle *mac_group_handle_t; -typedef struct __mac_intr_handle *mac_intr_handle_t; - -typedef struct mac_ring_info_s mac_ring_info_t; -typedef struct mac_group_info_s mac_group_info_t; - -typedef int (*mac_intr_enable_t)(mac_intr_handle_t); -typedef int (*mac_intr_disable_t)(mac_intr_handle_t); - -typedef struct mac_intr_s { - mac_intr_handle_t mi_handle; - mac_intr_enable_t mi_enable; - mac_intr_disable_t mi_disable; -} mac_intr_t; - -typedef void (*mac_get_ring_t)(void *, mac_ring_type_t, const int, const int, - mac_ring_info_t *, mac_ring_handle_t); -typedef void (*mac_get_group_t)(void *, mac_ring_type_t, const int, - mac_group_info_t *, mac_group_handle_t); - -typedef void (*mac_group_add_ring_t)(mac_group_driver_t, - mac_ring_driver_t, mac_ring_type_t); -typedef void (*mac_group_rem_ring_t)(mac_group_driver_t, - mac_ring_driver_t, mac_ring_type_t); - -/* - * Multiple Rings Capability - */ -typedef struct mac_capab_rings_s { - mac_ring_type_t mr_type; /* Ring type */ - mac_group_type_t mr_group_type; /* Grouping type */ - void *mr_handle; /* Group Driver Handle. 
*/
- uint_t mr_rnum; /* Number of rings */
- uint_t mr_gnum; /* Number of ring groups */
- mac_get_ring_t mr_rget; /* Get ring from driver */
- mac_get_group_t mr_gget; /* Get ring group from driver */
- mac_group_add_ring_t mr_gadd_ring; /* Add ring into a group */
- mac_group_rem_ring_t mr_grem_ring; /* Remove ring from a group */
-} mac_capab_rings_t;
-
-/*
- * Common ring functions and driver interfaces
+ * This capability allows the MAC layer to detect when a VNIC is being
+ * accessed, and implement the required shortcuts.
 */
-typedef int (*mac_ring_start_t)(mac_ring_driver_t);
-typedef void (*mac_ring_stop_t)(mac_ring_driver_t);
-typedef mblk_t *(*mac_ring_send_t)(void *, mblk_t *);
-typedef mblk_t *(*mac_ring_poll_t)(void *, int);
+typedef void *(*mac_client_handle_fn_t)(void *);
-typedef struct mac_ring_info_s {
- mac_ring_driver_t mr_driver;
- mac_ring_start_t mr_start;
- mac_ring_stop_t mr_stop;
- mac_intr_t mr_intr;
- union {
- mac_ring_send_t send;
- mac_ring_poll_t poll;
- } mrfunion;
-} mac_ring_info_s;
-
-#define mr_send mrfunion.send
-#define mr_poll mrfunion.poll
-
-typedef int (*mac_group_start_t)(mac_group_driver_t);
-typedef void (*mac_group_stop_t)(mac_group_driver_t);
-typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
-typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
-
-struct mac_group_info_s {
- mac_group_driver_t mrg_driver; /* Driver reference */
- mac_group_start_t mrg_start; /* Start the group */
- mac_group_stop_t mrg_stop; /* Stop the group */
- uint_t mrg_count; /* Count of rings */
- mac_intr_t mrg_intr; /* Optional per-group intr */
-
- /* Only used for rx groups */
- mac_add_mac_addr_t mrg_addmac; /* Add a MAC address */
- mac_rem_mac_addr_t mrg_remmac; /* Remove a MAC address */
-};
-
-/*
- * Share management functions.
- */
-typedef uint64_t mac_share_handle_t;
+typedef struct mac_capab_vnic_s {
+ void *mcv_arg;
+ mac_client_handle_fn_t mcv_mac_client_handle;
+} mac_capab_vnic_t;
-/*
- * Returns a Share handle to the client calling from above.
- */
-typedef int (*mac_alloc_share_t)(void *, uint64_t cookie,
- uint64_t *rcookie, mac_share_handle_t *);
-
-/*
- * Destroys the share previously allocated and deallocates
- * all share resources (e.g. DMA's assigned to the share).
- */
-typedef void (*mac_free_share_t)(mac_share_handle_t);
-
-typedef void (*mac_share_query_t)(mac_share_handle_t shdl,
- mac_ring_type_t type, uint32_t *rmin, uint32_t *rmax,
- uint64_t *rmap, uint64_t *gnum);
-
-/*
- * Basic idea, bind previously created ring groups to shares
- * for them to be exported (or shared) by another domain.
- * These interfaces bind/unbind the ring group to a share. Doing
- * so causes the resources to be shared with the guest.
- */
-typedef int (*mac_share_add_group_t)(mac_share_handle_t,
- mac_group_handle_t);
-typedef int (*mac_share_rem_group_t)(mac_share_handle_t,
- mac_group_handle_t);
-
-typedef struct mac_capab_share_s {
- uint_t ms_snum; /* Number of shares (vr's) */
- void *ms_handle; /* Handle to driver. */
- mac_alloc_share_t ms_salloc; /* Get a share from driver. */
- mac_free_share_t ms_sfree; /* Return a share to driver. */
- mac_share_add_group_t ms_sadd; /* Add a group to the share. */
- mac_share_rem_group_t ms_sremove; /* Remove group from share.
*/
- mac_share_query_t ms_squery; /* Query share constraints */
-} mac_capab_share_t;
+typedef void (*mac_rename_fn_t)(const char *, void *);
+typedef struct mac_capab_aggr_s {
+ mac_rename_fn_t mca_rename_fn;
+ int (*mca_unicst)(void *, const uint8_t *);
+} mac_capab_aggr_t;
-/*
- * Flags for mc_callbacks. Requiring drivers to set the flags associated
- * with optional callbacks initialized in the structure allows the mac
- * module to add optional callbacks in the future without requiring drivers
- * to recompile.
- */
-#define MC_RESOURCES 0x001
-#define MC_IOCTL 0x002
-#define MC_GETCAPAB 0x004
-#define MC_OPEN 0x008
-#define MC_CLOSE 0x010
-#define MC_SETPROP 0x020
-#define MC_GETPROP 0x040
-
-#define MAC_MAX_MINOR 1000
-
-typedef struct mac_register_s {
- uint_t m_version; /* set by mac_alloc() */
- const char *m_type_ident;
- void *m_driver; /* Driver private data */
- dev_info_t *m_dip;
- uint_t m_instance;
- uint8_t *m_src_addr;
- uint8_t *m_dst_addr;
- mac_callbacks_t *m_callbacks;
- uint_t m_min_sdu;
- uint_t m_max_sdu;
- void *m_pdata;
- size_t m_pdata_size;
- uint32_t m_margin;
- mac_priv_prop_t *m_priv_props;
- size_t m_priv_prop_count;
-} mac_register_t;
-
-
-/*
- * Opaque handle types.
- */
-typedef struct mac_t *mac_handle_t;
-typedef struct __mac_notify_handle *mac_notify_handle_t;
-typedef struct __mac_rx_handle *mac_rx_handle_t;
-typedef struct __mac_txloop_handle *mac_txloop_handle_t;
-typedef struct __mac_resource_handle *mac_resource_handle_t;
-
-/*
- * MAC interface callback types.
- */
 typedef enum {
 	MAC_NOTE_LINK,
 	MAC_NOTE_PROMISC,
@@ -604,15 +305,15 @@ typedef enum {
 	MAC_NOTE_DEVPROMISC,
 	MAC_NOTE_FASTPATH_FLUSH,
 	MAC_NOTE_SDU_SIZE,
-	MAC_NOTE_VNIC,
 	MAC_NOTE_MARGIN,
+	MAC_NOTE_CAPAB_CHG,
 	MAC_NNOTE /* must be the last entry */
 } mac_notify_type_t;
 
 typedef void (*mac_notify_t)(void *, mac_notify_type_t);
-typedef void (*mac_rx_t)(void *, mac_resource_handle_t, mblk_t *);
-typedef void (*mac_txloop_t)(void *, mblk_t *);
-typedef void (*mac_blank_t)(void *, time_t, uint_t);
+typedef void (*mac_rx_t)(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+typedef mblk_t *(*mac_receive_t)(void *, int);
 
 /*
  * MAC promiscuous types
@@ -629,26 +330,38 @@ typedef enum {
 	MAC_RX_FIFO = 1
 } mac_resource_type_t;
 
+typedef int (*mac_intr_enable_t)(mac_intr_handle_t);
+typedef int (*mac_intr_disable_t)(mac_intr_handle_t);
+
+typedef struct mac_intr_s {
+ mac_intr_handle_t mi_handle;
+ mac_intr_enable_t mi_enable;
+ mac_intr_disable_t mi_disable;
+} mac_intr_t;
+
 typedef struct mac_rx_fifo_s {
 	mac_resource_type_t mrf_type; /* MAC_RX_FIFO */
- mac_blank_t mrf_blank;
- void *mrf_arg;
- time_t mrf_normal_blank_time;
- uint_t mrf_normal_pkt_count;
+ mac_intr_t mrf_intr;
+ mac_receive_t mrf_receive;
+ void *mrf_rx_arg;
+ uint32_t mrf_flow_priority;
+ /*
+ * The CPU this flow is to be processed on. With intrd and future
+ * things, we should know which CPU the flow needs to be processed on
+ * and get a squeue assigned on that CPU.
+ */ + uint_t mrf_cpu_id; } mac_rx_fifo_t; -typedef struct mac_txinfo_s { - mac_tx_t mt_fn; - void *mt_arg; -} mac_txinfo_t; +#define mrf_intr_handle mrf_intr.mi_handle +#define mrf_intr_enable mrf_intr.mi_enable +#define mrf_intr_disable mrf_intr.mi_disable typedef union mac_resource_u { mac_resource_type_t mr_type; mac_rx_fifo_t mr_fifo; } mac_resource_t; -typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *); - typedef enum { MAC_ADDRTYPE_UNICAST, MAC_ADDRTYPE_MULTICAST, @@ -664,11 +377,29 @@ typedef struct mac_header_info_s { uint32_t mhi_bindsap; mac_addrtype_t mhi_dsttype; uint16_t mhi_tci; - uint_t mhi_istagged:1, - mhi_prom_looped:1; + boolean_t mhi_istagged; } mac_header_info_t; /* + * Function pointer to match dls client signature. Should be same as + * dls_rx_t to allow a soft ring to bypass DLS layer and call a DLS + * client directly. + */ +typedef void (*mac_direct_rx_t)(void *, mac_resource_handle_t, + mblk_t *, mac_header_info_t *); + +typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *); +typedef int (*mac_resource_bind_t)(void *, + mac_resource_handle_t, processorid_t); +typedef void (*mac_resource_remove_t)(void *, void *); +typedef void (*mac_resource_quiesce_t)(void *, void *); +typedef void (*mac_resource_restart_t)(void *, void *); +typedef int (*mac_resource_modify_t)(void *, void *, + mac_resource_t *); +typedef void (*mac_change_upcall_t)(void *, mac_direct_rx_t, + void *); + +/* * MAC-Type plugin interfaces */ @@ -782,6 +513,13 @@ typedef struct mac_ndd_mapping_s { #define mp_prop_id u_mp_id.u_id #define mp_kstat u_mp_id.u_kstat +typedef struct mac_stat_info_s { + uint_t msi_stat; + char *msi_name; + uint_t msi_type; /* as defined in kstat_named_init(9F) */ + uint64_t msi_default; +} mac_stat_info_t; + typedef struct mactype_register_s { uint_t mtr_version; /* set by mactype_alloc() */ const char *mtr_ident; @@ -803,107 +541,25 @@ typedef struct mac_prop_s { } mac_prop_t; /* - * Client interface functions. + * Driver interface functions. 
*/ -extern int mac_open(const char *, mac_handle_t *); extern int mac_open_by_linkid(datalink_id_t, mac_handle_t *); extern int mac_open_by_linkname(const char *, mac_handle_t *); -extern void mac_close(mac_handle_t); -extern const mac_info_t *mac_info(mac_handle_t); -extern boolean_t mac_info_get(const char *, mac_info_t *); -extern uint64_t mac_stat_get(mac_handle_t, uint_t); -extern int mac_start(mac_handle_t); -extern void mac_stop(mac_handle_t); -extern int mac_promisc_set(mac_handle_t, boolean_t, - mac_promisc_type_t); -extern boolean_t mac_promisc_get(mac_handle_t, - mac_promisc_type_t); -extern int mac_multicst_add(mac_handle_t, const uint8_t *); -extern int mac_multicst_remove(mac_handle_t, - const uint8_t *); -extern boolean_t mac_unicst_verify(mac_handle_t, - const uint8_t *, uint_t); -extern int mac_unicst_set(mac_handle_t, const uint8_t *); -extern void mac_unicst_get(mac_handle_t, uint8_t *); -extern void mac_dest_get(mac_handle_t, uint8_t *); -extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); -extern void mac_resources(mac_handle_t); -extern void mac_ioctl(mac_handle_t, queue_t *, mblk_t *); -extern const mac_txinfo_t *mac_tx_get(mac_handle_t); -extern const mac_txinfo_t *mac_vnic_tx_get(mac_handle_t); -extern link_state_t mac_link_get(mac_handle_t); -extern mac_notify_handle_t mac_notify_add(mac_handle_t, mac_notify_t, - void *); -extern void mac_notify_remove(mac_handle_t, - mac_notify_handle_t); -extern void mac_notify(mac_handle_t); -extern mac_rx_handle_t mac_rx_add(mac_handle_t, mac_rx_t, void *); -extern mac_rx_handle_t mac_active_rx_add(mac_handle_t, mac_rx_t, - void *); -extern void mac_rx_remove(mac_handle_t, mac_rx_handle_t, - boolean_t); -extern void mac_rx_remove_wait(mac_handle_t); -extern mblk_t *mac_txloop(void *, mblk_t *); -extern mac_txloop_handle_t mac_txloop_add(mac_handle_t, mac_txloop_t, - void *); -extern void mac_txloop_remove(mac_handle_t, - mac_txloop_handle_t); -extern boolean_t mac_active_set(mac_handle_t); -extern boolean_t mac_active_shareable_set(mac_handle_t); -extern void mac_active_clear(mac_handle_t); -extern void mac_active_rx(void *, mac_resource_handle_t, - mblk_t *); -extern boolean_t mac_vnic_set(mac_handle_t, mac_txinfo_t *, - mac_getcapab_t, void *); -extern void mac_vnic_clear(mac_handle_t); -extern void mac_resource_set(mac_handle_t, - mac_resource_add_t, void *); -extern dev_info_t *mac_devinfo_get(mac_handle_t); extern const char *mac_name(mac_handle_t); extern minor_t mac_minor(mac_handle_t); -extern boolean_t mac_capab_get(mac_handle_t, mac_capab_t, - void *); -extern boolean_t mac_vnic_capab_get(mac_handle_t, mac_capab_t, - void *); -extern boolean_t mac_sap_verify(mac_handle_t, uint32_t, - uint32_t *); -extern mblk_t *mac_header(mac_handle_t, const uint8_t *, - uint32_t, mblk_t *, size_t); -extern int mac_header_info(mac_handle_t, mblk_t *, - mac_header_info_t *); -extern mblk_t *mac_header_cook(mac_handle_t, mblk_t *); -extern mblk_t *mac_header_uncook(mac_handle_t, mblk_t *); extern minor_t mac_minor_hold(boolean_t); extern void mac_minor_rele(minor_t); +extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); +extern int mac_maxsdu_update(mac_handle_t, uint_t); -/* - * Driver interface functions. 
- */ -extern mac_register_t *mac_alloc(uint_t); -extern void mac_free(mac_register_t *); -extern int mac_register(mac_register_t *, mac_handle_t *); -extern int mac_disable(mac_handle_t); -extern int mac_unregister(mac_handle_t); -extern void mac_rx(mac_handle_t, mac_resource_handle_t, - mblk_t *); -extern void mac_link_update(mac_handle_t, link_state_t); extern void mac_unicst_update(mac_handle_t, const uint8_t *); -extern void mac_tx_update(mac_handle_t); extern void mac_resource_update(mac_handle_t); -extern mac_resource_handle_t mac_resource_add(mac_handle_t, - mac_resource_t *); -extern int mac_maxsdu_update(mac_handle_t, uint_t); +extern void mac_capab_update(mac_handle_t); extern int mac_pdata_update(mac_handle_t, void *, size_t); -extern void mac_multicst_refresh(mac_handle_t, - mac_multicst_t, void *, boolean_t); -extern void mac_unicst_refresh(mac_handle_t, mac_unicst_t, - void *); -extern void mac_promisc_refresh(mac_handle_t, - mac_setpromisc_t, void *); extern boolean_t mac_margin_update(mac_handle_t, uint32_t); extern void mac_margin_get(mac_handle_t, uint32_t *); extern int mac_margin_remove(mac_handle_t, uint32_t); @@ -912,18 +568,17 @@ extern int mac_margin_add(mac_handle_t, uint32_t *, extern void mac_init_ops(struct dev_ops *, const char *); extern void mac_fini_ops(struct dev_ops *); extern uint32_t mac_no_notification(mac_handle_t); -extern boolean_t mac_is_legacy(mac_handle_t); -extern int mac_hold_exclusive(mac_handle_t); -extern void mac_rele_exclusive(mac_handle_t); extern mactype_register_t *mactype_alloc(uint_t); extern void mactype_free(mactype_register_t *); extern int mactype_register(mactype_register_t *); extern int mactype_unregister(const char *); -extern int mac_set_prop(mac_handle_t, mac_prop_t *, - void *, uint_t); -extern int mac_get_prop(mac_handle_t, mac_prop_t *, - void *, uint_t, uint_t *); +extern void mac_set_ring(void *, void *); + +extern void mac_start_logusage(mac_logtype_t, uint_t); +extern void mac_stop_logusage(mac_logtype_t); + +extern mac_handle_t mac_get_lower_mac_handle(mac_handle_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h new file mode 100644 index 0000000000..f1743577ef --- /dev/null +++ b/usr/src/uts/common/sys/mac_client.h @@ -0,0 +1,184 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This file captures the MAC client API definitions. It can be + * included from any MAC clients. 
+ */ + +#ifndef _SYS_MAC_CLIENT_H +#define _SYS_MAC_CLIENT_H + +#include <sys/mac.h> +#include <sys/mac_flow.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * MAC client interface. + */ + +typedef struct __mac_client_handle *mac_client_handle_t; +typedef struct __mac_unicast_handle *mac_unicast_handle_t; +typedef struct __mac_promisc_handle *mac_promisc_handle_t; +typedef struct __mac_perim_handle *mac_perim_handle_t; +typedef uintptr_t mac_tx_cookie_t; + +typedef void (*mac_tx_notify_t)(void *, mac_tx_cookie_t); + +typedef enum { + MAC_DIAG_NONE, + MAC_DIAG_MACADDR_NIC, + MAC_DIAG_MACADDR_INUSE, + MAC_DIAG_MACADDR_INVALID, + MAC_DIAG_MACADDRLEN_INVALID, + MAC_DIAG_MACFACTORYSLOTINVALID, + MAC_DIAG_MACFACTORYSLOTUSED, + MAC_DIAG_MACFACTORYSLOTALLUSED, + MAC_DIAG_MACFACTORYNOTSUP, + MAC_DIAG_MACPREFIX_INVALID, + MAC_DIAG_MACPREFIXLEN_INVALID, + MAC_DIAG_MACNO_HWRINGS +} mac_diag_t; + +typedef enum { + MAC_CLIENT_PROMISC_ALL, + MAC_CLIENT_PROMISC_FILTERED, + MAC_CLIENT_PROMISC_MULTI +} mac_client_promisc_type_t; + +/* flags passed to mac_unicast_add() */ +#define MAC_UNICAST_NODUPCHECK 0x0001 +#define MAC_UNICAST_PRIMARY 0x0002 +#define MAC_UNICAST_HW 0x0004 +#define MAC_UNICAST_VNIC_PRIMARY 0x0008 + +/* flags passed to mac_client_open */ +#define MAC_OPEN_FLAGS_IS_VNIC 0x0001 +#define MAC_OPEN_FLAGS_EXCLUSIVE 0x0002 +#define MAC_OPEN_FLAGS_TAG_DISABLE 0x0004 +#define MAC_OPEN_FLAGS_IS_AGGR_PORT 0x0008 +#define MAC_OPEN_FLAGS_STRIP_DISABLE 0x0010 +#define MAC_OPEN_FLAGS_NO_HWRINGS 0x0020 +#define MAC_OPEN_FLAGS_SHARES_DESIRED 0x0040 +#define MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK 0x0080 +#define MAC_OPEN_FLAGS_USE_DATALINK_NAME 0x0100 +#define MAC_OPEN_FLAGS_REQ_HWRINGS 0x0200 + +/* flags passed to mac_client_close */ +#define MAC_CLOSE_FLAGS_IS_VNIC 0x0001 +#define MAC_CLOSE_FLAGS_EXCLUSIVE 0x0002 +#define MAC_CLOSE_FLAGS_IS_AGGR_PORT 0x0004 + +/* flags passed to mac_promisc_add() */ +#define MAC_PROMISC_FLAGS_NO_TX_LOOP 0x0001 +#define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 + +/* flags passed to mac_tx() */ +#define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ +#define MAC_TX_NO_ENQUEUE 0x02 /* don't enqueue mblks if not xmit'ed */ +#define MAC_TX_NO_HOLD 0x04 /* don't bump the active Tx count */ + +extern int mac_client_open(mac_handle_t, mac_client_handle_t *, char *, + uint16_t); +extern void mac_client_close(mac_client_handle_t, uint16_t); + +extern int mac_unicast_add(mac_client_handle_t, uint8_t *, uint16_t, + mac_unicast_handle_t *, uint16_t, mac_diag_t *); +extern int mac_unicast_primary_add(mac_client_handle_t, mac_unicast_handle_t *, + mac_diag_t *); +extern int mac_unicast_remove(mac_client_handle_t, mac_unicast_handle_t); + +extern int mac_multicast_add(mac_client_handle_t, const uint8_t *); +extern void mac_multicast_remove(mac_client_handle_t, const uint8_t *); + +extern void mac_rx_set(mac_client_handle_t, mac_rx_t, void *); +extern void mac_rx_clear(mac_client_handle_t); +extern mac_tx_cookie_t mac_tx(mac_client_handle_t, mblk_t *, + uintptr_t, uint16_t, mblk_t **); +extern boolean_t mac_tx_is_flow_blocked(mac_client_handle_t, mac_tx_cookie_t); +extern uint64_t mac_client_stat_get(mac_client_handle_t, uint_t); + +extern int mac_promisc_add(mac_client_handle_t, mac_client_promisc_type_t, + mac_rx_t, void *, mac_promisc_handle_t *, uint16_t); +extern int mac_promisc_remove(mac_promisc_handle_t); + +extern mac_notify_handle_t mac_notify_add(mac_handle_t, mac_notify_t, void *); +extern int mac_notify_remove(mac_notify_handle_t, boolean_t); +extern 
void mac_notify_remove_wait(mac_handle_t); +extern int mac_rename_primary(mac_handle_t, const char *); +extern char *mac_client_name(mac_client_handle_t); + +extern int mac_open(const char *, mac_handle_t *); +extern void mac_close(mac_handle_t); +extern uint64_t mac_stat_get(mac_handle_t, uint_t); + +extern int mac_unicast_primary_set(mac_handle_t, const uint8_t *); +extern void mac_unicast_primary_get(mac_handle_t, uint8_t *); +extern void mac_unicast_primary_info(mac_handle_t, char *, boolean_t *); + +extern int mac_addr_random(mac_client_handle_t, uint_t, uint8_t *, + mac_diag_t *); + +extern int mac_addr_factory_reserve(mac_client_handle_t, int *); +extern void mac_addr_factory_release(mac_client_handle_t, uint_t); +extern void mac_addr_factory_value(mac_handle_t, int, uchar_t *, uint_t *, + char *, boolean_t *); +extern uint_t mac_addr_factory_num(mac_handle_t); + +extern uint_t mac_addr_len(mac_handle_t); + +extern mac_tx_notify_handle_t mac_client_tx_notify(mac_client_handle_t, + mac_tx_notify_t, void *); + +extern int mac_set_resources(mac_handle_t, mac_resource_props_t *); +extern void mac_get_resources(mac_handle_t, mac_resource_props_t *); +extern int mac_client_set_resources(mac_client_handle_t, + mac_resource_props_t *); +extern void mac_client_get_resources(mac_client_handle_t, + mac_resource_props_t *); + +extern int mac_share_capable(mac_handle_t); +extern int mac_share_bind(mac_client_handle_t, uint64_t, uint64_t *); +extern void mac_share_unbind(mac_client_handle_t); + +extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *); + +extern uint_t mac_hwgrp_num(mac_handle_t); +extern void mac_get_hwgrp_info(mac_handle_t, int, uint_t *, uint_t *, + uint_t *, uint_t *, char *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_CLIENT_H */ diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h new file mode 100644 index 0000000000..29d2a40ff1 --- /dev/null +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -0,0 +1,318 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_MAC_CLIENT_IMPL_H +#define _SYS_MAC_CLIENT_IMPL_H + +#include <sys/modhash.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac.h> +#include <sys/mac_impl.h> +#include <net/if.h> +#include <sys/mac_flow_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern kmem_cache_t *mac_client_impl_cache; +extern kmem_cache_t *mac_unicast_impl_cache; +extern kmem_cache_t *mac_promisc_impl_cache; + +/* + * Need a list to chain all VIDs assigned to a client. Normally, one + * MAC client only has one VID. 
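A minimal sketch of how a kernel consumer might drive the client API declared above; the xx_ names and the "e1000g0" instance are hypothetical, most error paths are elided, and the flags a real client passes to mac_client_open()/mac_unicast_add() depend on its type:

static void
xx_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t is_loopback)
{
	/* a real client would hand the chain up its own stack */
	freemsgchain(mp);
}

static int
xx_client_attach(mac_handle_t *mhp, mac_client_handle_t *mchp,
    mac_unicast_handle_t *muhp)
{
	mac_diag_t diag;
	int err;

	if ((err = mac_open("e1000g0", mhp)) != 0)
		return (err);
	if ((err = mac_client_open(*mhp, mchp, "xx_client", 0)) != 0) {
		mac_close(*mhp);
		return (err);
	}
	/* use the NIC's primary address; factory/random addrs also exist */
	if ((err = mac_unicast_primary_add(*mchp, muhp, &diag)) != 0) {
		mac_client_close(*mchp, 0);
		mac_close(*mhp);
		return (err);
	}
	/* start receiving; the handles are kept for the client's lifetime */
	mac_rx_set(*mchp, xx_rx, NULL);
	return (0);
}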
But vsw might need multiple VIDs.
+ */
+typedef struct mac_unicast_impl_s { /* Protected by */
+ struct mac_unicast_impl_s *mui_next; /* SL */
+ mac_address_t *mui_map; /* SL */
+ uint16_t mui_vid; /* SL */
+} mac_unicast_impl_t;
+
+#define MAC_CLIENT_FLAGS_PRIMARY 0x0001
+#define MAC_CLIENT_FLAGS_VNIC_PRIMARY 0x0002
+
+/*
+ * One of these is instantiated per MAC client promiscuous callback.
+ *
+ * Each element of this structure belongs to two linked lists. One
+ * for the mac_client_impl_t (mci_promisc_list) which allocated
+ * the callback, the other for the mac_impl_t (mi_promisc_list) corresponding
+ * to the MAC client.
+ * The former allows us to do bookkeeping, the latter allows us
+ * to more efficiently dispatch packets to the promiscuous callbacks.
+ */
+typedef struct mac_promisc_impl_s { /* Protected by */
+ mac_cb_t mpi_mci_link; /* mi_promisc_lock */
+ mac_cb_t mpi_mi_link; /* mi_promisc_lock */
+ mac_client_promisc_type_t mpi_type; /* WO */
+ mac_rx_t mpi_fn; /* WO */
+ void *mpi_arg; /* WO */
+ struct mac_client_impl_s *mpi_mcip; /* WO */
+ boolean_t mpi_no_tx_loop; /* WO */
+ boolean_t mpi_no_phys; /* WO */
+} mac_promisc_impl_t;
+
+typedef union mac_tx_percpu_s {
+ struct {
+ kmutex_t _pcpu_tx_lock;
+ uint_t _pcpu_tx_refcnt;
+ } pcpu_lr;
+ uchar_t pcpu_pad[64];
+} mac_tx_percpu_t;
+
+#define pcpu_tx_lock pcpu_lr._pcpu_tx_lock
+#define pcpu_tx_refcnt pcpu_lr._pcpu_tx_refcnt
+
+/*
+ * One of these is instantiated for each MAC client.
+ */
+struct mac_client_impl_s { /* Protected by */
+ struct mac_client_impl_s *mci_client_next; /* mi_rw_lock */
+ char mci_name[MAXNAMELEN]; /* mi_rw_lock */
+ /*
+ * This flow entry will contain all the internal constructs
+ * such as SRS etc. for this MAC client. The MAC client may
+ * have more than one flow corresponding to each upper client
+ * sharing this mac_client_impl_t.
+ */
+ flow_entry_t *mci_flent; /* mi_rw_lock */
+ struct mac_impl_s *mci_mip; /* WO */
+ /*
+ * If this is a client that has a pass thru MAC (e.g. a VNIC),
+ * then we also keep the handle for the client's upper MAC.
+ */
+ struct mac_impl_s *mci_upper_mip; /* WO */
+
+ uint32_t mci_state_flags; /* WO */
+ mac_rx_t mci_rx_fn; /* Rx Quiescence */
+ void *mci_rx_arg; /* Rx Quiescence */
+ mac_direct_rx_t mci_direct_rx_fn; /* SL */
+ void *mci_direct_rx_arg; /* SL */
+
+ mac_cb_t *mci_promisc_list; /* mi_promisc_lock */
+
+ mac_address_t *mci_unicast;
+ uint32_t mci_flags; /* SL */
+ krwlock_t mci_rw_lock;
+ mac_unicast_impl_t *mci_unicast_list; /* mci_rw_lock */
+ /*
+ * The mac_client_impl_t may be shared by multiple clients, i.e.
+ * multiple VLANs sharing the same MAC client. In this case the
+ * address/vid tuples differ and are each associated with their
+ * own flow entry, but the rest of the underlying components (SRS, etc.)
+ * are common.
+ */
+ flow_entry_t *mci_flent_list; /* mci_rw_lock */
+ uint_t mci_nflents; /* mci_rw_lock */
+ uint_t mci_nvids; /* mci_rw_lock */
+
+ /* Resource Management Functions */
+ mac_resource_add_t mci_resource_add; /* SL */
+ mac_resource_remove_t mci_resource_remove; /* SL */
+ mac_resource_quiesce_t mci_resource_quiesce; /* SL */
+ mac_resource_restart_t mci_resource_restart; /* SL */
+ mac_resource_bind_t mci_resource_bind; /* SL */
+ void *mci_resource_arg; /* SL */
+
+
+ /* Tx notify callback */
+ kmutex_t mci_tx_cb_lock;
+ mac_cb_info_t mci_tx_notify_cb_info; /* cb list info */
+ mac_cb_t *mci_tx_notify_cb_list; /* The cb list */
+ uintptr_t mci_tx_notify_id;
+
+ /* per MAC client stats */ /* None */
+ uint64_t mci_stat_multircv;
+ uint64_t mci_stat_brdcstrcv;
+ uint64_t mci_stat_multixmt;
+ uint64_t mci_stat_brdcstxmt;
+ uint64_t mci_stat_obytes;
+ uint64_t mci_stat_opackets;
+ uint64_t mci_stat_oerrors;
+ uint64_t mci_stat_ibytes;
+ uint64_t mci_stat_ipackets;
+ uint64_t mci_stat_ierrors;
+
+ flow_tab_t *mci_subflow_tab; /* Rx quiescence */
+
+ /*
+ * Priority range for this MAC client. This is the range
+ * corresponding to the priority configured (nr_flow_priority).
+ */
+ pri_t mci_min_pri;
+ pri_t mci_max_pri;
+
+ /*
+ * Hybrid I/O related definitions.
+ */
+ mac_share_handle_t mci_share;
+ boolean_t mci_share_bound;
+ boolean_t mci_no_hwrings;
+
+ /* The client requests a hardware group */
+ boolean_t mci_req_hwrings;
+
+ /* for multicast support */
+ struct mac_mcast_addrs_s *mci_mcast_addrs; /* mi_rw_lock */
+
+ /*
+ * Protected by mci_tx_pcpu[0].pcpu_tx_lock
+ */
+ uint_t mci_tx_flag;
+ kcondvar_t mci_tx_cv;
+
+ /* Must be last in the structure for dynamic sizing */
+ mac_tx_percpu_t mci_tx_pcpu[1]; /* SL */
+};
+
+#define MAC_CLIENT_IMPL_SIZE \
+ (sizeof (mac_client_impl_t) + \
+ (mac_tx_percpu_cnt * sizeof (mac_tx_percpu_t)))
+
+extern int mac_tx_percpu_cnt;
+
+#define MCIP_TX_SRS(mcip) \
+ ((mcip)->mci_flent == NULL ? NULL : (mcip)->mci_flent->fe_tx_srs)
+
+/* Defensive coding, non-null mci_flent could be an assert */
+
+#define MCIP_DATAPATH_SETUP(mcip) \
+ ((mcip)->mci_flent == NULL ? B_FALSE : \
+ !((mcip)->mci_flent->fe_flags & FE_MC_NO_DATAPATH))
+
+#define MCIP_RESOURCE_PROPS(mcip) \
+ ((mcip)->mci_flent == NULL ? NULL : \
+ &(mcip)->mci_flent->fe_resource_props)
+
+#define MCIP_EFFECTIVE_PROPS(mcip) \
+ (mcip->mci_flent == NULL ? NULL : \
+ &(mcip)->mci_flent->fe_effective_props)
+
+#define MCIP_RESOURCE_PROPS_MASK(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_mask)
+
+#define MCIP_RESOURCE_PROPS_MAXBW(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_maxbw)
+
+#define MCIP_RESOURCE_PROPS_PRIORITY(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_priority)
+
+#define MCIP_RESOURCE_PROPS_CPUS(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ &(mcip)->mci_flent->fe_resource_props.mrp_cpus)
+
+#define MCIP_RESOURCE_PROPS_NCPUS(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_ncpus)
+
+#define MCIP_RESOURCE_PROPS_CPU(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_ncpu)
+
+/*
+ * We validate the VLAN id of the packet w.r.t the client's vid,
+ * if required (i.e. !MCIS_DISABLE_TX_VID_CHECK). DLS clients
+ * will have MCIS_DISABLE_TX_VID_CHECK set.
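Written out as a plain function, the transmit-side VID validation described here (and encoded by the MAC_VID_CHECK macro below) looks roughly as follows; a sketch only, assuming <sys/ethernet.h> and <sys/vlan.h> for the frame layout and VLAN_ID():

static int
xx_vid_check(mac_client_impl_t *mcip, mblk_t *mp)
{
	struct ether_header *ehp = (void *)mp->b_rptr;

	if (ntohs(ehp->ether_type) != ETHERTYPE_VLAN)
		return (0); /* untagged frame: nothing to validate */

	if ((mcip->mci_state_flags & MCIS_TAG_DISABLE) != 0) {
		struct ether_vlan_header *evhp = (void *)mp->b_rptr;
		uint16_t vid = VLAN_ID(ntohs(evhp->ether_tci));

		/* the tag is acceptable if it names one of the client VIDs */
		if (mac_client_check_flow_vid(mcip, vid))
			return (0);
	}
	return (EINVAL); /* caller is expected to freemsg() */
}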
+ * (In the case of aggr when we get back packets, due to + * the underlying driver being flow controlled, we won't + * drop the packet even if it is VLAN tagged as we + * don't set MCIS_DISABLE_TX_VID_CHECK for an aggr.) + */ +#define MAC_VID_CHECK_NEEDED(mcip) \ + (((mcip)->mci_state_flags & MCIS_DISABLE_TX_VID_CHECK) == 0 && \ + (mcip)->mci_mip->mi_info.mi_nativemedia == DL_ETHER) + +#define MAC_VID_CHECK(mcip, mp, err) { \ + if (ntohs(((struct ether_header *)(mp)->b_rptr)->ether_type) == \ + ETHERTYPE_VLAN) { \ + /* \ + * err is set to EINVAL (so the caller can take the \ + * appropriate action. e.g. freemsg()) for two cases: \ + * -client is not responsible for filling in the vid. \ + * -client is responsible for filling in the vid, but \ + * the vid doesn't match the vid of the MAC client. \ + */ \ + (err) = EINVAL; \ + if (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) != 0) {\ + struct ether_vlan_header *evhp; \ + uint16_t vlanid; \ + \ + evhp = (struct ether_vlan_header *)(mp)->b_rptr;\ + vlanid = VLAN_ID(ntohs(evhp->ether_tci)); \ + if (mac_client_check_flow_vid((mcip), vlanid)) \ + (err) = 0; \ + } \ + } \ +} + +#define MAC_TAG_NEEDED(mcip) \ + (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 && \ + (mcip)->mci_nvids == 1) \ + +/* MCI state flags */ +#define MCIS_IS_VNIC 0x0001 +#define MCIS_EXCLUSIVE 0x0002 +#define MCIS_TAG_DISABLE 0x0004 +#define MCIS_STRIP_DISABLE 0x0008 +#define MCIS_IS_AGGR_PORT 0x0010 +#define MCIS_CLIENT_POLL_CAPABLE 0x0020 +#define MCIS_DESC_LOGGED 0x0040 +#define MCIS_SHARE_BOUND 0x0080 +#define MCIS_NO_HWRINGS 0x0100 +#define MCIS_DISABLE_TX_VID_CHECK 0x0200 +#define MCIS_USE_DATALINK_NAME 0x0400 + +/* in mac_client.c */ +extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); +extern void mac_client_init(void); +extern void mac_client_fini(void); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, + mac_client_impl_t *); + +extern int mac_validate_props(mac_resource_props_t *); + +extern mac_client_impl_t *mac_vnic_lower(mac_impl_t *); +extern mac_client_impl_t *mac_primary_client_handle(mac_impl_t *); +extern uint16_t i_mac_flow_vid(flow_entry_t *); +extern boolean_t i_mac_capab_get(mac_handle_t, mac_capab_t, void *); + +extern void mac_unicast_update_clients(mac_impl_t *, mac_address_t *); +extern void mac_update_resources(mac_resource_props_t *, + mac_resource_props_t *, boolean_t); + +boolean_t mac_client_check_flow_vid(mac_client_impl_t *, uint16_t); + +extern boolean_t mac_is_primary_client(mac_client_impl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_CLIENT_IMPL_H */ diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h new file mode 100644 index 0000000000..7e22552aeb --- /dev/null +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This file contains *private* MAC API definitions. This header file + * should only be included by kernel components which are part of the + * GLDv3 stack (dld, dls, aggr, softmac). + */ + +#ifndef _SYS_MAC_CLIENT_PRIV_H +#define _SYS_MAC_CLIENT_PRIV_H + +#include <sys/mac.h> +#include <sys/mac_flow.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#ifdef DEBUG +#define MAC_PERIM_HELD(mph) mac_perim_held(mph) +#else +#define MAC_PERIM_HELD(mph) +#endif + +extern boolean_t mac_rx_bypass_set(mac_client_handle_t, mac_direct_rx_t, + void *); + +extern const mac_info_t *mac_info(mac_handle_t); +extern boolean_t mac_info_get(const char *, mac_info_t *); +extern int mac_promisc_set(mac_handle_t, boolean_t, mac_promisc_type_t); +extern boolean_t mac_promisc_get(mac_handle_t, mac_promisc_type_t); + +extern void mac_ioctl(mac_handle_t, queue_t *, mblk_t *); +extern link_state_t mac_link_get(mac_handle_t); +extern void mac_resource_set(mac_client_handle_t, mac_resource_add_t, void *); +extern dev_info_t *mac_devinfo_get(mac_handle_t); +extern boolean_t mac_capab_get(mac_handle_t, mac_capab_t, void *); +extern boolean_t mac_sap_verify(mac_handle_t, uint32_t, uint32_t *); +extern mblk_t *mac_header(mac_handle_t, const uint8_t *, uint32_t, mblk_t *, + size_t); +extern int mac_header_info(mac_handle_t, mblk_t *, mac_header_info_t *); +extern mblk_t *mac_header_cook(mac_handle_t, mblk_t *); +extern mblk_t *mac_header_uncook(mac_handle_t, mblk_t *); + +extern void mac_resource_set_common(mac_client_handle_t, + mac_resource_add_t, mac_resource_remove_t, mac_resource_quiesce_t, + mac_resource_restart_t, mac_resource_bind_t, void *); + +extern void mac_perim_enter_by_mh(mac_handle_t, mac_perim_handle_t *); +extern int mac_perim_enter_by_macname(const char *, mac_perim_handle_t *); +extern int mac_perim_enter_by_linkid(datalink_id_t, mac_perim_handle_t *); +extern void mac_perim_exit(mac_perim_handle_t); +extern boolean_t mac_perim_held(mac_handle_t); + +extern uint16_t mac_client_vid(mac_client_handle_t); +extern int mac_vnic_unicast_set(mac_client_handle_t, const uint8_t *); + +extern void mac_client_poll_enable(mac_client_handle_t); +extern void mac_client_poll_disable(mac_client_handle_t); + +extern int mac_resource_ctl_set(mac_client_handle_t, mac_resource_props_t *); +extern void mac_resource_ctl_get(mac_client_handle_t, mac_resource_props_t *); + +/* + * Flow-related APIs for MAC clients. 
+ */ + +extern void mac_link_init_flows(mac_client_handle_t); +extern void mac_link_release_flows(mac_client_handle_t); +extern int mac_link_flow_add(datalink_id_t, char *, flow_desc_t *, + mac_resource_props_t *); +extern int mac_link_flow_remove(char *); +extern int mac_link_flow_modify(char *, mac_resource_props_t *); +extern boolean_t mac_link_has_flows(mac_client_handle_t); + +typedef struct { + char fi_flow_name[MAXNAMELEN]; + datalink_id_t fi_link_id; + flow_desc_t fi_flow_desc; + mac_resource_props_t fi_resource_props; +} mac_flowinfo_t; + +extern int mac_link_flow_walk(datalink_id_t, + int (*)(mac_flowinfo_t *, void *), void *); +extern int mac_link_flow_info(char *, mac_flowinfo_t *); + +extern void *mac_tx_hold(mac_client_handle_t); +extern void mac_tx_rele(mac_client_handle_t, void *); +extern void mac_rx_client_quiesce(mac_client_handle_t); +extern void mac_rx_client_restart(mac_client_handle_t); +extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); +extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *, + mac_ring_handle_t *); +extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t); +extern void mac_hwring_teardown(mac_ring_handle_t); +extern int mac_hwring_disable_intr(mac_ring_handle_t); +extern int mac_hwring_enable_intr(mac_ring_handle_t); +extern int mac_hwring_start(mac_ring_handle_t); +extern void mac_hwring_stop(mac_ring_handle_t); +extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int); +#define MAC_HWRING_POLL(ring, bytes) \ + (((ring)->mr_info.mri_poll) \ + ((ring)->mr_info.mri_driver, (bytes))) + +extern int mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *); +extern int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *); + +extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t); + +extern int mac_mark_exclusive(mac_handle_t); +extern void mac_unmark_exclusive(mac_handle_t); + +extern int32_t mac_client_intr_cpu(mac_client_handle_t); +extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); +extern void *mac_get_devinfo(mac_handle_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_CLIENT_PRIV_H */ diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h new file mode 100644 index 0000000000..05ed62a217 --- /dev/null +++ b/usr/src/uts/common/sys/mac_flow.h @@ -0,0 +1,210 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef _MAC_FLOW_H
+#define _MAC_FLOW_H
+
+/*
+ * Main structure describing a flow of packets, for classification use
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <netinet/in.h> /* for IPPROTO_* constants */
+#include <sys/ethernet.h>
+
+#define MAXFLOWNAME 32
+
+/* need to use MAXMACADDRLEN from dld.h instead of this one */
+#define MAXMACADDR 20
+
+/* Bit-mask for the selectors carried in the flow descriptor */
+typedef uint64_t flow_mask_t;
+
+#define FLOW_LINK_DST 0x00000001 /* Destination MAC addr */
+#define FLOW_LINK_SRC 0x00000002 /* Source MAC address */
+#define FLOW_LINK_VID 0x00000004 /* VLAN ID */
+#define FLOW_LINK_SAP 0x00000008 /* SAP value */
+
+#define FLOW_IP_VERSION 0x00000010 /* V4 or V6 */
+#define FLOW_IP_PROTOCOL 0x00000020 /* Protocol type */
+#define FLOW_IP_LOCAL 0x00000040 /* Local address */
+#define FLOW_IP_REMOTE 0x00000080 /* Remote address */
+#define FLOW_IP_DSFIELD 0x00000100 /* DSfield value */
+
+#define FLOW_ULP_PORT_LOCAL 0x00001000 /* ULP local port */
+#define FLOW_ULP_PORT_REMOTE 0x00002000 /* ULP remote port */
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct flow_desc_s {
+ flow_mask_t fd_mask;
+ uint32_t fd_mac_len;
+ uint8_t fd_dst_mac[MAXMACADDR];
+ uint8_t fd_src_mac[MAXMACADDR];
+ uint16_t fd_vid;
+ uint32_t fd_sap;
+ uint8_t fd_ipversion;
+ uint8_t fd_protocol;
+ in6_addr_t fd_local_addr;
+ in6_addr_t fd_local_netmask;
+ in6_addr_t fd_remote_addr;
+ in6_addr_t fd_remote_netmask;
+ in_port_t fd_local_port;
+ in_port_t fd_remote_port;
+ uint8_t fd_dsfield;
+ uint8_t fd_dsfield_mask;
+} flow_desc_t;
+
+#define MRP_NCPUS 128
+
+/*
+ * In MCM_CPUS mode, CPU bindings are user-specified. In MCM_FANOUT mode,
+ * the user only specifies a fanout count.
+ * mc_fanout_cnt gives the number of CPUs used for fanout soft rings.
+ * mc_fanout_cpus[] array stores the CPUs used for fanout soft rings.
+ */
+typedef enum {
+ MCM_FANOUT = 1,
+ MCM_CPUS
+} mac_cpu_mode_t;
+
+typedef struct mac_cpus_props_s {
+ uint32_t mc_ncpus; /* num of cpus */
+ uint32_t mc_cpus[MRP_NCPUS]; /* cpu list */
+ uint32_t mc_fanout_cnt; /* soft ring cpu cnt */
+ uint32_t mc_fanout_cpus[MRP_NCPUS]; /* SR cpu list */
+ uint32_t mc_pollid; /* poll thr binding */
+ uint32_t mc_workerid; /* worker thr binding */
+ /*
+ * interrupt cpu: mrp_intr_cpu less than 0 implies a platform limitation
+ * in retargeting the interrupt assignment.
+ */
+ int32_t mc_intr_cpu;
+ mac_cpu_mode_t mc_fanout_mode; /* fanout mode */
+} mac_cpus_t;
+
+/* Priority values */
+typedef enum {
+ MPL_LOW,
+ MPL_MEDIUM,
+ MPL_HIGH,
+ MPL_RESET
+} mac_priority_level_t;
+
+/* The default priority for links */
+#define MPL_LINK_DEFAULT MPL_HIGH
+
+/* The default priority for flows */
+#define MPL_SUBFLOW_DEFAULT MPL_MEDIUM
+
+#define MRP_MAXBW 0x00000001 /* Limit set */
+#define MRP_CPUS 0x00000002 /* CPU/fanout set */
+#define MRP_CPUS_USERSPEC 0x00000004 /* CPU/fanout from user */
+#define MRP_PRIORITY 0x00000008 /* Priority set */
+
+#define MRP_THROTTLE MRP_MAXBW
+
+/* 3 levels - low, medium, high */
+#define MRP_PRIORITY_LEVELS 3
+
+/* Special value denoting no bandwidth control */
+#define MRP_MAXBW_RESETVAL -1ULL
+
+/*
+ * Until a sub-megabit limit is implemented,
+ * reject values lower than 1 MTU per tick (1.2 Mbps)
+ */
+#define MRP_MAXBW_MINVAL 1200000
+
+typedef struct mac_resource_props_s {
+ /*
+ * Bit-mask for the network resource control types
+ */
+ uint32_t mrp_mask;
+ uint64_t mrp_maxbw; /* bandwidth limit in bps */
+ mac_priority_level_t mrp_priority; /* relative flow priority */
+ mac_cpus_t mrp_cpus;
+} mac_resource_props_t;
+
+#define mrp_ncpus mrp_cpus.mc_ncpus
+#define mrp_cpu mrp_cpus.mc_cpus
+#define mrp_fanout_cnt mrp_cpus.mc_fanout_cnt
+#define mrp_fanout_cpu mrp_cpus.mc_fanout_cpus
+#define mrp_pollid mrp_cpus.mc_pollid
+#define mrp_workerid mrp_cpus.mc_workerid
+#define mrp_intr_cpu mrp_cpus.mc_intr_cpu
+#define mrp_fanout_mode mrp_cpus.mc_fanout_mode
+
+#define MAC_COPY_CPUS(mrp, fmrp) { \
+ int ncpus; \
+ (fmrp)->mrp_ncpus = (mrp)->mrp_ncpus; \
+ (fmrp)->mrp_intr_cpu = (mrp)->mrp_intr_cpu; \
+ (fmrp)->mrp_fanout_mode = (mrp)->mrp_fanout_mode; \
+ if ((mrp)->mrp_ncpus == 0) { \
+ (fmrp)->mrp_mask &= ~MRP_CPUS; \
+ (fmrp)->mrp_mask &= ~MRP_CPUS_USERSPEC; \
+ } else { \
+ for (ncpus = 0; ncpus < (fmrp)->mrp_ncpus; ncpus++) \
+ (fmrp)->mrp_cpu[ncpus] = (mrp)->mrp_cpu[ncpus];\
+ (fmrp)->mrp_mask |= MRP_CPUS; \
+ if ((mrp)->mrp_mask & MRP_CPUS_USERSPEC) \
+ (fmrp)->mrp_mask |= MRP_CPUS_USERSPEC; \
+ } \
+}
+
+typedef struct flow_stats_s {
+ uint64_t fs_rbytes;
+ uint64_t fs_ipackets;
+ uint64_t fs_ierrors;
+ uint64_t fs_obytes;
+ uint64_t fs_opackets;
+ uint64_t fs_oerrors;
+} flow_stats_t;
+
+typedef enum {
+ FLOW_STAT_RBYTES,
+ FLOW_STAT_IPACKETS,
+ FLOW_STAT_IERRORS,
+ FLOW_STAT_OBYTES,
+ FLOW_STAT_OPACKETS,
+ FLOW_STAT_OERRORS
+} flow_stat_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MAC_FLOW_H */
diff --git a/usr/src/uts/common/sys/mac_flow_impl.h b/usr/src/uts/common/sys/mac_flow_impl.h
new file mode 100644
index 0000000000..6029873930
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_flow_impl.h
@@ -0,0 +1,537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _MAC_FLOW_IMPL_H +#define _MAC_FLOW_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/param.h> +#include <sys/atomic.h> +#include <sys/ksynch.h> +#include <sys/mac_flow.h> +#include <sys/stream.h> +#include <sys/sdt.h> +#include <net/if.h> + +/* + * Macros to increment/decrement the reference count on a flow_entry_t. + */ +#define FLOW_REFHOLD(flent) { \ + DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_refcnt++; \ + mutex_exit(&(flent)->fe_lock); \ +} + +/* + * Data paths must not attempt to use a flow entry if it is marked INCIPIENT + * or QUIESCE. In the former case the set up is not yet complete and the + * data path could stumble on inconsistent data structures. In the latter + * case a control operation is waiting for quiescence so that it can + * change callbacks or other structures without the use of locks. + */ +#define FLOW_TRY_REFHOLD(flent, err) { \ + DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ + (err) = 0; \ + mutex_enter(&(flent)->fe_lock); \ + if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \ + FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH)) \ + (err) = -1; \ + else \ + (flent)->fe_refcnt++; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_REFRELE(flent) { \ + DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent)); \ + mutex_enter(&(flent)->fe_lock); \ + ASSERT((flent)->fe_refcnt != 0); \ + (flent)->fe_refcnt--; \ + if ((flent)->fe_flags & FE_WAITER) { \ + ASSERT((flent)->fe_refcnt != 0); \ + cv_signal(&(flent)->fe_cv); \ + mutex_exit(&(flent)->fe_lock); \ + } else if ((flent)->fe_refcnt == 0) { \ + mac_flow_destroy(flent); \ + } else { \ + mutex_exit(&(flent)->fe_lock); \ + } \ +} + +#define FLOW_USER_REFHOLD(flent) { \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_user_refcnt++; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_USER_REFRELE(flent) { \ + mutex_enter(&(flent)->fe_lock); \ + ASSERT((flent)->fe_user_refcnt != 0); \ + if (--(flent)->fe_user_refcnt == 0 && \ + ((flent)->fe_flags & FE_WAITER)) \ + cv_signal(&(flent)->fe_cv); \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_FINAL_REFRELE(flent) { \ + ASSERT(flent->fe_refcnt == 1 && flent->fe_user_refcnt == 0); \ + FLOW_REFRELE(flent); \ +} + +/* + * Mark or unmark the flent with a bit flag + */ +#define FLOW_MARK(flent, flag) { \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_flags |= flag; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_UNMARK(flent, flag) { \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_flags &= ~flag; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLENT_TO_MIP(flent) \ + (flent->fe_mbg != NULL ? mac_bcast_grp_mip(flent->fe_mbg) : \ + ((mac_client_impl_t *)flent->fe_mcip)->mci_mip) + +/* Convert a bandwidth expressed in bps to a number of bytes per tick. */ +#define FLOW_BYTES_PER_TICK(bps) (((bps) >> 3) / hz) + +/* + * Given an underlying range and a priority level, obtain the minimum for the + * new range. 
+ */
+#define FLOW_MIN_PRIORITY(min, max, pri) \
+ ((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri)))
+
+/*
+ * Given an underlying range and a minimum level (base), obtain the maximum
+ * for the new range.
+ */
+#define FLOW_MAX_PRIORITY(min, max, base) \
+ ((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS))
+
+/*
+ * Given an underlying range and a priority level, get the absolute
+ * priority value. For now there are just 3 values, high, low and
+ * medium so we can just return max, min or min + (max - min) / 2.
+ * If there are more than three we need to change this computation.
+ */
+#define FLOW_PRIORITY(min, max, pri) \
+ (pri) == MPL_HIGH ? (max) : \
+ (pri) == MPL_LOW ? (min) : \
+ ((min) + (((max) - (min)) / 2))
+
+#define MAC_FLOW_TAB_SIZE 500
+
+typedef struct flow_entry_s flow_entry_t;
+typedef struct flow_tab_s flow_tab_t;
+typedef struct flow_state_s flow_state_t;
+struct mac_impl_s;
+struct mac_client_impl_s;
+
+/*
+ * Classification flags used to lookup the flow.
+ */
+#define FLOW_INBOUND 0x01
+#define FLOW_OUTBOUND 0x02
+/* Don't compare VID when classifying the packets, see mac_rx_classify() */
+#define FLOW_IGNORE_VLAN 0x04
+
+/* Generic flow client function signature */
+typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t);
+
+/* Flow state */
+typedef enum {
+ FLOW_DRIVER_UPCALL,
+ FLOW_USER_REF
+} mac_flow_state_t;
+
+/* Matches a flow_entry_t using the extracted flow_state_t info */
+typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
+ flow_state_t *);
+
+/* fe_flags */
+#define FE_QUIESCE 0x01 /* Quiesce the flow */
+#define FE_WAITER 0x02 /* Flow has a waiter */
+#define FE_FLOW_TAB 0x04 /* Flow is in the flow tab list */
+#define FE_G_FLOW_HASH 0x08 /* Flow is in the global flow hash */
+#define FE_INCIPIENT 0x10 /* Being setup */
+#define FE_CONDEMNED 0x20 /* Being deleted */
+#define FE_UF_NO_DATAPATH 0x40 /* No datapath setup for User flow */
+#define FE_MC_NO_DATAPATH 0x80 /* No datapath setup for mac client */
+
+/* fe_type */
+#define FLOW_PRIMARY_MAC 0x01 /* NIC primary MAC address */
+#define FLOW_VNIC_MAC 0x02 /* VNIC flow */
+#define FLOW_MCAST 0x04 /* Multicast (and broadcast) */
+#define FLOW_OTHER 0x08 /* Other flows configured */
+#define FLOW_USER 0x10 /* User defined flow */
+#define FLOW_VNIC FLOW_VNIC_MAC
+#define FLOW_NO_STATS 0x20 /* Don't create stats for the flow */
+
+/*
+ * Bandwidth control counters shared between the soft ring set and its
+ * associated soft rings. In case the flow associated with the NIC/VNIC
+ * has a group of Rx rings assigned to it, we have as many
+ * soft ring sets as there are Rx rings in the group,
+ * and each individual SRS (and its soft rings) decides when to
+ * poll its Rx ring independently. But if there is a B/W limit
+ * associated with the NIC/VNIC, then the B/W control counter is
+ * shared across all the SRSes in the group and their associated
+ * soft rings.
+ *
+ * There is a many-to-1 mapping between the SRS and
+ * mac_bw_ctl if the flow has a group of Rx rings associated with
+ * it.
+ */
+typedef struct mac_bw_ctl_s {
+ kmutex_t mac_bw_lock;
+ uint32_t mac_bw_state;
+ size_t mac_bw_sz; /* ??
Is it needed */
+ size_t mac_bw_limit; /* Max bytes to process per tick */
+ size_t mac_bw_used; /* Bytes processed in current tick */
+ size_t mac_bw_drop_threshold; /* Max queue length */
+ size_t mac_bw_drop_bytes;
+ size_t mac_bw_polled;
+ size_t mac_bw_intr;
+ clock_t mac_bw_curr_time;
+} mac_bw_ctl_t;
+
+struct flow_entry_s { /* Protected by */
+ struct flow_entry_s *fe_next; /* ft_lock */
+
+ datalink_id_t fe_link_id; /* WO */
+
+ /* Properties as specified for this flow */
+ mac_resource_props_t fe_resource_props; /* SL */
+
+ /* Properties actually effective at run time for this flow */
+ mac_resource_props_t fe_effective_props; /* SL */
+
+ kmutex_t fe_lock;
+ char fe_flow_name[MAXFLOWNAME]; /* fe_lock */
+ flow_desc_t fe_flow_desc; /* fe_lock */
+ kcondvar_t fe_cv; /* fe_lock */
+ /*
+ * Initial flow ref is 1 on creation. A thread that looks up the
+ * flent, typically via mac_flow_lookup(), dynamically holds a ref.
+ * If the ref is 1, it means there aren't any upcalls from the driver
+ * or downcalls from the stack using this flent. Structures pointing
+ * to the flent or flent inserted in lists don't count towards this
+ * refcnt. Instead they are tracked using fe_flags. Only a control
+ * thread doing a teardown operation deletes the flent, after waiting
+ * for upcalls to finish synchronously. The fe_refcnt tracks
+ * the number of upcall refs.
+ */
+ uint32_t fe_refcnt; /* fe_lock */
+
+ /*
+ * This tracks lookups done using the global hash list for user
+ * generated flows. This refcnt only protects the flent itself
+ * from disappearing and helps walkers to read the flent info such
+ * as flow spec. However the flent may be quiesced and the SRS could
+ * be deleted. The fe_user_refcnt tracks the number of global flow
+ * hash refs.
+ */
+ uint32_t fe_user_refcnt; /* fe_lock */
+ uint_t fe_flags; /* fe_lock */
+
+ /*
+ * Function/args to invoke for delivering matching packets.
+ * Only the function fe_cb_fn may be changed dynamically and atomically.
+ * The fe_cb_arg1 and fe_cb_arg2 are set at creation time and may not
+ * be changed.
+ */
+ flow_fn_t fe_cb_fn; /* fe_lock */
+ void *fe_cb_arg1; /* fe_lock */
+ void *fe_cb_arg2; /* fe_lock */
+
+ void *fe_client_cookie; /* WO */
+ void *fe_rx_ring_group; /* SL */
+ void *fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */
+ int fe_rx_srs_cnt; /* fe_lock */
+ void *fe_tx_srs; /* WO */
+
+ /*
+ * This is a unicast flow, and is a mac_client_impl_t
+ */
+ void *fe_mcip; /* WO */
+
+ /*
+ * Used by mci_flent_list of mac_client_impl_t to track flows sharing
+ * the same mac_client_impl_t.
+ */
+ struct flow_entry_s *fe_client_next;
+
+ /*
+ * This is a broadcast or multicast flow and is a mac_bcast_grp_t
+ */
+ void *fe_mbg; /* WO */
+ uint_t fe_type; /* WO */
+
+ /*
+ * BW control info.
+ */
+ mac_bw_ctl_t fe_tx_bw;
+ mac_bw_ctl_t fe_rx_bw;
+
+ /*
+ * Used by flow table lookup code
+ */
+ flow_match_fn_t fe_match;
+
+ /*
+ * Used by mac_flow_remove().
+ */
+ int fe_index;
+ flow_tab_t *fe_flow_tab;
+
+ kstat_t *fe_ksp;
+ flow_stats_t fe_flowstats;
+ boolean_t fe_desc_logged;
+ zoneid_t fe_zoneid;
+ uint64_t fe_nic_speed;
+};
+
+/*
+ * Various structures used by the flows framework for keeping track
+ * of packet state information.
+ */
+
+/* Layer 2 */
+typedef struct flow_l2info_s {
+ uchar_t *l2_start;
+ uint8_t *l2_daddr;
+ uint16_t l2_vid;
+ uint32_t l2_sap;
+ uint_t l2_hdrsize;
+} flow_l2info_t;
+
+/* Layer 3 */
+typedef struct flow_l3info_s {
+ uchar_t *l3_start;
+ uint8_t l3_protocol;
+ uint8_t l3_version;
+ boolean_t l3_dst_or_src;
+ uint_t l3_hdrsize;
+ boolean_t l3_fragmented;
+} flow_l3info_t;
+
+/* Layer 4 */
+typedef struct flow_l4info_s {
+ uchar_t *l4_start;
+ uint16_t l4_src_port;
+ uint16_t l4_dst_port;
+ uint16_t l4_hash_port;
+} flow_l4info_t;
+
+/*
+ * Combined state structure.
+ * Holds flow direction and an mblk_t pointer.
+ */
+struct flow_state_s {
+ uint_t fs_flags;
+ mblk_t *fs_mp;
+ flow_l2info_t fs_l2info;
+ flow_l3info_t fs_l3info;
+ flow_l4info_t fs_l4info;
+};
+
+/*
+ * Flow ops vector.
+ * There are two groups of functions. The ones ending with _fe are
+ * called when a flow is being added. The others (hash, accept) are
+ * called at flow lookup time.
+ */
+#define FLOW_MAX_ACCEPT 16
+typedef struct flow_ops_s {
+ /*
+ * fo_accept_fe():
+ * Validates the contents of the flow and checks whether
+ * it's compatible with the flow table. Sets the fe_match
+ * function of the flow.
+ */
+ int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
+ /*
+ * fo_hash_fe():
+ * Generates a hash index to the flow table. This function
+ * must use the same algorithm as fo_hash(), which is used
+ * by the flow lookup code path.
+ */
+ uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
+ /*
+ * fo_match_fe():
+ * This is used for finding identical flows.
+ */
+ boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *,
+ flow_entry_t *);
+ /*
+ * fo_insert_fe():
+ * Used for inserting a flow into a flow chain.
+ * Protocols that have special ordering requirements would
+ * need to implement this. For those that don't,
+ * flow_generic_insert_fe() may be used.
+ */
+ int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
+ flow_entry_t *);
+
+ /*
+ * Calculates the flow hash index based on the accumulated
+ * state in flow_state_t. Must use the same algorithm as
+ * fo_hash_fe().
+ */
+ uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *);
+
+ /*
+ * Array of accept functions.
+ * Each function in the array will accumulate enough state
+ * (header length, protocol) to allow the next function to
+ * proceed. We support up to FLOW_MAX_ACCEPT functions which
+ * should be sufficient for all practical purposes.
+ */
+ int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
+ flow_state_t *);
+} flow_ops_t;
+
+/*
+ * Generic flow table.
+ */
+struct flow_tab_s {
+ krwlock_t ft_lock;
+ /*
+ * Contains a list of functions (described above)
+ * specific to this table type.
+ */
+ flow_ops_t ft_ops;
+
+ /*
+ * Indicates what types of flows are supported.
+ */
+ flow_mask_t ft_mask;
+
+ /*
+ * An array of flow_entry_t * of size ft_size.
+ * Each element is the beginning of a hash chain.
+ */
+ flow_entry_t **ft_table;
+ uint_t ft_size;
+
+ /*
+ * The number of flows inserted into ft_table.
+ */
+ uint_t ft_flow_count;
+ struct mac_impl_s *ft_mip;
+ struct mac_client_impl_s *ft_mcip;
+};
+
+/*
+ * This is used for describing what type of flow table can be created.
+ * mac_flow.c contains a list of these structures.
+ */
+typedef struct flow_tab_info_s {
+ flow_ops_t *fti_ops;
+ flow_mask_t fti_mask;
+ uint_t fti_size;
+} flow_tab_info_t;
+
+#define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0)
+
+/*
+ * This is used by mac_tx_send.
+ */ +typedef struct mac_tx_stats_s { + uint_t ts_opackets; + uint_t ts_obytes; + uint_t ts_oerrors; +} mac_tx_stats_t; + +#define FLOW_STAT_UPDATE(f, s, c) { \ + ((flow_entry_t *)(f))->fe_flowstats.fs_##s += ((uint64_t)(c)); \ +} + +#define FLOW_TX_STATS_UPDATE(f, s) { \ + FLOW_STAT_UPDATE((f), opackets, (s)->ts_opackets); \ + FLOW_STAT_UPDATE((f), obytes, (s)->ts_obytes); \ + FLOW_STAT_UPDATE((f), oerrors, (s)->ts_oerrors); \ +} + +extern void mac_flow_init(); +extern void mac_flow_fini(); +extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *, + char *, void *, uint_t, flow_entry_t **); + +extern int mac_flow_add(flow_tab_t *, flow_entry_t *); +extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *, + boolean_t); +extern int mac_flow_hash_add(flow_entry_t *); +extern int mac_flow_lookup_byname(char *, flow_entry_t **); +extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t, + flow_entry_t **); + +extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *), + void *); + +extern int mac_flow_walk_nolock(flow_tab_t *, + int (*)(flow_entry_t *, void *), void *); + +extern void mac_flow_modify(flow_tab_t *, flow_entry_t *, + mac_resource_props_t *); + +extern void *mac_flow_get_client_cookie(flow_entry_t *); + +extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *); + +extern int mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *); +extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *); +extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *); + +extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t); +extern void mac_flow_hash_remove(flow_entry_t *); +extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t); +extern void mac_flow_quiesce(flow_entry_t *); +extern void mac_flow_restart(flow_entry_t *); +extern void mac_flow_cleanup(flow_entry_t *); +extern void mac_flow_destroy(flow_entry_t *); + +extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t, + struct mac_impl_s *, flow_tab_t **); +extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **); +extern void mac_flow_tab_destroy(flow_tab_t *); +extern void mac_flow_drop(void *, void *, mblk_t *); +extern void flow_stat_destroy(flow_entry_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _MAC_FLOW_IMPL_H */ diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 6b36a978f0..9c8bfb7ce9 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -26,23 +26,17 @@ #ifndef _SYS_MAC_IMPL_H #define _SYS_MAC_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/mac.h> +#include <sys/modhash.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> #include <net/if.h> +#include <sys/mac_flow_impl.h> +#include <netinet/ip6.h> #ifdef __cplusplus extern "C" { #endif -typedef struct mac_multicst_addr_s mac_multicst_addr_t; - -struct mac_multicst_addr_s { - mac_multicst_addr_t *mma_nextp; - uint_t mma_ref; - uint8_t mma_addr[MAXMACADDRLEN]; -}; - typedef struct mac_margin_req_s mac_margin_req_t; struct mac_margin_req_s { @@ -51,31 +45,85 @@ struct mac_margin_req_s { uint32_t mmr_margin; }; -typedef struct mac_notify_fn_s mac_notify_fn_t; +/* Generic linked chain type */ +typedef struct mac_chain_s { + struct mac_chain_s *next; + void *item; +} mac_chain_t; -struct mac_notify_fn_s { - mac_notify_fn_t *mnf_nextp; - mac_notify_t mnf_fn; - void *mnf_arg; -}; +/* + * Generic mac callback list manipulation structures and macros. 
The mac_cb_t + * represents a general callback list element embedded in a particular + * data structure such as a mac_notify_cb_t or a mac_promisc_impl_t. + * The mac_cb_info_t represents general information about list walkers. + * Please see the comments above mac_callback_add for more information. + */ +/* mcb_flags */ +#define MCB_CONDEMNED 0x1 /* Logically deleted */ +#define MCB_NOTIFY_CB_T 0x2 +#define MCB_TX_NOTIFY_CB_T 0x4 + +typedef struct mac_cb_s { + struct mac_cb_s *mcb_nextp; /* Linked list of callbacks */ + void *mcb_objp; /* Ptr to enclosing object */ + size_t mcb_objsize; /* Sizeof the enclosing obj */ + uint_t mcb_flags; +} mac_cb_t; + +typedef struct mac_cb_info_s { + kmutex_t *mcbi_lockp; + kcondvar_t mcbi_cv; + uint_t mcbi_del_cnt; /* Deleted callback cnt */ + uint_t mcbi_walker_cnt; /* List walker count */ +} mac_cb_info_t; + +typedef struct mac_notify_cb_s { + mac_cb_t mncb_link; /* Linked list of callbacks */ + mac_notify_t mncb_fn; /* callback function */ + void *mncb_arg; /* callback argument */ + struct mac_impl_s *mncb_mip; +} mac_notify_cb_t; -typedef struct mac_rx_fn_s mac_rx_fn_t; +/* + * mac_callback_add(listinfo, listhead, listelement) + * mac_callback_remove(listinfo, listhead, listelement) + */ +typedef boolean_t (*mcb_func_t)(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -struct mac_rx_fn_s { - mac_rx_fn_t *mrf_nextp; - mac_rx_t mrf_fn; - void *mrf_arg; - boolean_t mrf_inuse; - boolean_t mrf_active; -}; +#define MAC_CALLBACK_WALKER_INC(mcbi) { \ + mutex_enter((mcbi)->mcbi_lockp); \ + (mcbi)->mcbi_walker_cnt++; \ + mutex_exit((mcbi)->mcbi_lockp); \ +} -typedef struct mac_txloop_fn_s mac_txloop_fn_t; +#define MAC_CALLBACK_WALKER_INC_HELD(mcbi) (mcbi)->mcbi_walker_cnt++; -struct mac_txloop_fn_s { - mac_txloop_fn_t *mtf_nextp; - mac_txloop_t mtf_fn; - void *mtf_arg; -}; +#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) { \ + mac_cb_t *rmlist; \ + \ + mutex_enter((mcbi)->mcbi_lockp); \ + if (--(mcbi)->mcbi_walker_cnt == 0 && (mcbi)->mcbi_del_cnt != 0) { \ + rmlist = mac_callback_walker_cleanup((mcbi), headp); \ + mac_callback_free(rmlist); \ + cv_broadcast(&(mcbi)->mcbi_cv); \ + } \ + mutex_exit((mcbi)->mcbi_lockp); \ +} + +#define MAC_PROMISC_WALKER_INC(mip) \ + MAC_CALLBACK_WALKER_INC(&(mip)->mi_promisc_cb_info) + +#define MAC_PROMISC_WALKER_DCR(mip) { \ + mac_cb_info_t *mcbi; \ + \ + mcbi = &(mip)->mi_promisc_cb_info; \ + mutex_enter(mcbi->mcbi_lockp); \ + if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) { \ + i_mac_promisc_walker_cleanup(mip); \ + cv_broadcast(&mcbi->mcbi_cv); \ + } \ + mutex_exit(mcbi->mcbi_lockp); \ +} typedef struct mactype_s { const char *mt_ident; @@ -91,118 +139,354 @@ typedef struct mactype_s { size_t mt_mappingcount; } mactype_t; +/* + * Multiple rings implementation. + */ +typedef enum { + MAC_GROUP_STATE_UNINIT = 0, /* initial state of data structure */ + MAC_GROUP_STATE_REGISTERED, /* hooked with h/w group */ + MAC_GROUP_STATE_RESERVED, /* group is reserved and opened */ + MAC_GROUP_STATE_SHARED /* default group shared among */ + /* multiple mac clients */ +} mac_group_state_t; + +typedef struct mac_ring_s mac_ring_t; +typedef struct mac_group_s mac_group_t; + +/* + * Ring data structure for ring control and management. 
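+ *
+ * A ring is reference-held while in use; see the MR_REFHOLD_LOCKED()
+ * and MR_REFRELE() macros below. The following sketch is illustrative
+ * only (the calling context is hypothetical):
+ *
+ *	mutex_enter(&ring->mr_lock);
+ *	MR_REFHOLD_LOCKED(ring);
+ *	mutex_exit(&ring->mr_lock);
+ *	... use the ring ...
+ *	MR_REFRELE(ring);	(signals a quiesce/teardown waiter on
+ *				last release if the ring is marked
+ *				MR_CONDEMNED or MR_QUIESCE)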
+ */ +typedef enum { + MR_FREE, /* Available for assignment to flows */ + MR_NEWLY_ADDED, /* Just assigned to another group */ + MR_INUSE /* Assigned to an SRS */ +} mac_ring_state_t; + +/* mr_flag values */ +#define MR_INCIPIENT 0x1 +#define MR_CONDEMNED 0x2 +#define MR_QUIESCE 0x4 + +struct mac_ring_s { + int mr_index; /* index in the original list */ + mac_ring_type_t mr_type; /* ring type */ + mac_ring_t *mr_next; /* next ring in the chain */ + mac_group_handle_t mr_gh; /* reference to group */ + + mac_classify_type_t mr_classify_type; /* HW vs SW */ + struct mac_soft_ring_set_s *mr_srs; /* associated SRS */ + uint_t mr_refcnt; /* Ring references */ + /* ring generation no. to guard against drivers using stale rings */ + uint64_t mr_gen_num; + + kmutex_t mr_lock; + kcondvar_t mr_cv; /* mr_lock */ + mac_ring_state_t mr_state; /* mr_lock */ + uint_t mr_flag; /* mr_lock */ + + mac_ring_info_t mr_info; /* driver supplied info */ +}; +#define mr_driver mr_info.mri_driver +#define mr_start mr_info.mri_start +#define mr_stop mr_info.mri_stop + +#define MAC_RING_MARK(mr, flag) \ + (mr)->mr_flag |= flag; -#define MAC_VNIC_TXINFO_REFHOLD(mvt) { \ - mutex_enter(&(mvt)->mv_lock); \ - (mvt)->mv_refs++; \ - mutex_exit(&(mvt)->mv_lock); \ +#define MAC_RING_UNMARK(mr, flag) \ + (mr)->mr_flag &= ~flag; + +/* + * Reference hold and release on mac_ring_t 'mr' + */ +#define MR_REFHOLD_LOCKED(mr) { \ + ASSERT(MUTEX_HELD(&mr->mr_lock)); \ + (mr)->mr_refcnt++; \ } -#define MAC_VNIC_TXINFO_REFRELE(mvt) { \ - mutex_enter(&(mvt)->mv_lock); \ - if (--(mvt)->mv_refs == 0 && (mvt)->mv_clearing) { \ - (mvt)->mv_clearing = B_FALSE; \ - cv_signal(&(mvt)->mv_cv); \ - } \ - mutex_exit(&(mvt)->mv_lock); \ +#define MR_REFRELE(mr) { \ + mutex_enter(&(mr)->mr_lock); \ + ASSERT((mr)->mr_refcnt != 0); \ + (mr)->mr_refcnt--; \ + if ((mr)->mr_refcnt == 0 && \ + ((mr)->mr_flag & (MR_CONDEMNED | MR_QUIESCE))) \ + cv_signal(&(mr)->mr_cv); \ + mutex_exit(&(mr)->mr_lock); \ } -typedef struct mac_vnic_tx_s { - mac_txinfo_t mv_txinfo; /* provided by VNIC */ - uint32_t mv_refs; - kmutex_t mv_lock; - kcondvar_t mv_cv; - boolean_t mv_clearing; -} mac_vnic_tx_t; +/* + * Per mac client flow information associated with a RX group. + * The entire structure is SL protected. + */ +typedef struct mac_grp_client { + struct mac_grp_client *mgc_next; + struct mac_client_impl_s *mgc_client; +} mac_grp_client_t; + +#define MAC_RX_GROUP_NO_CLIENT(g) ((g)->mrg_clients == NULL) +#define MAC_RX_GROUP_ONLY_CLIENT(g) \ + ((((g)->mrg_clients != NULL) && \ + ((g)->mrg_clients->mgc_next == NULL)) ? \ + (g)->mrg_clients->mgc_client : NULL) /* - * Each registered MAC is associated with a mac_t structure. + * Common ring group data structure for ring control and management. 
+ * The entire structure is SL protected
+ */
-typedef struct mac_impl_s {
+struct mac_group_s {
+ int mrg_index; /* index in the list */
+ mac_ring_type_t mrg_type; /* ring type */
+ mac_group_state_t mrg_state; /* state of the group */
+ mac_group_t *mrg_next; /* next ring in the chain */
+ mac_handle_t mrg_mh; /* reference to MAC */
+ mac_ring_t *mrg_rings; /* grouped rings */
+ uint_t mrg_cur_count; /* actual size of group */
+
+ mac_grp_client_t *mrg_clients; /* clients list */
+
+ struct mac_client_impl_s *mrg_tx_client; /* TX client pointer */
+ mac_group_info_t mrg_info; /* driver supplied info */
+};
+
+#define mrg_driver mrg_info.mgi_driver
+#define mrg_start mrg_info.mgi_start
+#define mrg_stop mrg_info.mgi_stop
+
+#define GROUP_INTR_HANDLE(g) (g)->mrg_info.mgi_intr.mi_handle
+#define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable
+#define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable
+
+#define MAC_DEFAULT_GROUP(mh) (((mac_impl_t *)mh)->mi_rx_groups)
+
+#define MAC_RING_TX_DEFAULT(mip, mp) \
+ ((mip->mi_default_tx_ring == NULL) ? \
+ mip->mi_tx(mip->mi_driver, mp) : \
+ mac_ring_tx(mip->mi_default_tx_ring, mp))
+
+#define MAC_TX(mip, ring, mp, mcip) { \
+ /* \
+ * If the MAC client has a bound Hybrid I/O share, \
+ * send the packet through the default tx ring, since \
+ * the tx rings of this client are now mapped in the \
+ * guest domain and not accessible from this domain. \
+ */ \
+ if (mcip->mci_share_bound || (ring == NULL)) \
+ mp = MAC_RING_TX_DEFAULT(mip, mp); \
+ else \
+ mp = mac_ring_tx(ring, mp); \
+}
+
+/* mci_tx_flag */
+#define MCI_TX_QUIESCE 0x1
+
+typedef struct mac_factory_addr_s {
+ boolean_t mfa_in_use;
+ uint8_t mfa_addr[MAXMACADDRLEN];
+ struct mac_client_impl_s *mfa_client;
+} mac_factory_addr_t;
+
+typedef struct mac_mcast_addrs_s {
+ struct mac_mcast_addrs_s *mma_next;
+ uint8_t mma_addr[MAXMACADDRLEN];
+ int mma_ref;
+} mac_mcast_addrs_t;
+
+typedef enum {
+ MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* hardware steering */
+ MAC_ADDRESS_TYPE_UNICAST_PROMISC /* promiscuous mode */
+} mac_address_type_t;
+
+typedef struct mac_impl_s mac_impl_t;
+
+typedef struct mac_address_s {
+ mac_address_type_t ma_type; /* address type */
+ int ma_nusers; /* number of users */
+ /* of that address */
+ struct mac_address_s *ma_next; /* next address */
+ uint8_t ma_addr[MAXMACADDRLEN]; /* address value */
+ size_t ma_len; /* address length */
+ mac_group_t *ma_group; /* associated group */
+ mac_impl_t *ma_mip; /* MAC handle */
+} mac_address_t;
+
+extern krwlock_t i_mac_impl_lock;
+extern mod_hash_t *i_mac_impl_hash;
+extern kmem_cache_t *i_mac_impl_cachep;
+extern uint_t i_mac_impl_count;
+
+/*
+ * Each registered MAC is associated with a mac_impl_t structure. The
+ * structure represents the underlying hardware, in terms of definition,
+ * resources (transmit, receive rings etc.), callback functions etc. It
+ * also holds the table of MAC clients that are configured on the device.
+ * The table is used for classifying incoming packets in software.
+ *
+ * The protection scheme uses two elements, a coarse serialization mechanism
+ * called the perimeter and a finer traditional lock based scheme. More details
+ * can be found in the big block comment in mac.c.
+ *
+ * The protection scheme for each member of the mac_impl_t is described below.
+ *
+ * Write Once Only (WO): Typically these don't change for the lifetime of the
+ * data structure.
For example something in mac_impl_t that stays the same + * from mac_register to mac_unregister, or something in a mac_client_impl_t + * that stays the same from mac_client_open to mac_client_close. + * + * Serializer (SL): Protected by the Serializer. All SLOP operations on a + * mac endpoint go through the serializer. MTOPs don't care about reading + * these fields atomically. + * + * Lock: Traditional mutex/rw lock. Modify operations still go through the + * mac serializer, the lock helps synchronize readers with writers. + */ +struct mac_impl_s { + krwlock_t mi_rw_lock; + char mi_name[LIFNAMSIZ]; /* WO */ + uint32_t mi_state_flags; + void *mi_driver; /* Driver private, WO */ + mac_info_t mi_info; /* WO */ + mactype_t *mi_type; /* WO */ + void *mi_pdata; /* WO */ + size_t mi_pdata_size; /* WO */ + mac_callbacks_t *mi_callbacks; /* WO */ + dev_info_t *mi_dip; /* WO */ + uint32_t mi_ref; /* i_mac_impl_lock */ + uint_t mi_active; /* SL */ + link_state_t mi_linkstate; /* none */ + link_state_t mi_lastlinkstate; /* none */ + uint_t mi_promisc; /* SL */ + uint_t mi_devpromisc; /* SL */ + kmutex_t mi_lock; + uint8_t mi_addr[MAXMACADDRLEN]; /* mi_rw_lock */ + uint8_t mi_dstaddr[MAXMACADDRLEN]; /* mi_rw_lock */ + /* - * The following fields are set in mac_register() and will not be - * changed until mac_unregister(). No lock is needed to access them. + * The mac perimeter. All client initiated create/modify operations + * on a mac end point go through this. */ - char mi_name[LIFNAMSIZ]; - void *mi_driver; /* Driver private data */ - mac_info_t mi_info; - mactype_t *mi_type; - void *mi_pdata; - size_t mi_pdata_size; - mac_callbacks_t *mi_callbacks; - dev_info_t *mi_dip; - minor_t mi_minor; - dev_t mi_phy_dev; - kstat_t *mi_ksp; - uint_t mi_kstat_count; - mac_txinfo_t mi_txinfo; - mac_txinfo_t mi_txloopinfo; - - krwlock_t mi_gen_lock; - uint32_t mi_oref; - uint32_t mi_ref; - boolean_t mi_disabled; - boolean_t mi_exclusive; - - krwlock_t mi_state_lock; - uint_t mi_active; - - krwlock_t mi_data_lock; - link_state_t mi_linkstate; - link_state_t mi_lastlinkstate; - uint_t mi_promisc; - uint_t mi_devpromisc; - uint8_t mi_addr[MAXMACADDRLEN]; - uint8_t mi_dstaddr[MAXMACADDRLEN]; - uint_t mi_sdu_min; - uint_t mi_sdu_max; - mac_multicst_addr_t *mi_mmap; - - krwlock_t mi_notify_lock; - uint32_t mi_notify_bits; - kmutex_t mi_notify_bits_lock; - kthread_t *mi_notify_thread; - mac_notify_fn_t *mi_mnfp; - kcondvar_t mi_notify_cv; - - krwlock_t mi_rx_lock; - mac_rx_fn_t *mi_mrfp; - krwlock_t mi_tx_lock; - mac_txloop_fn_t *mi_mtfp; - - krwlock_t mi_resource_lock; - mac_resource_add_t mi_resource_add; - void *mi_resource_add_arg; - - kmutex_t mi_activelink_lock; - boolean_t mi_activelink; - - uint32_t mi_rx_ref; /* #threads in mac_rx() */ - uint32_t mi_rx_removed; /* #callbacks marked */ - /* for removal */ - kmutex_t mi_lock; - kcondvar_t mi_rx_cv; - boolean_t mi_shareable; - boolean_t mi_vnic_present; - mac_vnic_tx_t *mi_vnic_tx; - mac_txinfo_t mi_vnic_txinfo; - mac_txinfo_t mi_vnic_txloopinfo; - mac_getcapab_t mi_vnic_getcapab_fn; - void *mi_vnic_getcapab_arg; - - boolean_t mi_legacy; - uint32_t mi_unsup_note; - uint32_t mi_margin; + kmutex_t mi_perim_lock; + kthread_t *mi_perim_owner; /* mi_perim_lock */ + uint_t mi_perim_ocnt; /* mi_perim_lock */ + kcondvar_t mi_perim_cv; /* mi_perim_lock */ + + /* mac notification callbacks */ + kmutex_t mi_notify_lock; + mac_cb_info_t mi_notify_cb_info; /* mi_notify_lock */ + mac_cb_t *mi_notify_cb_list; /* mi_notify_lock */ + kthread_t *mi_notify_thread; /* mi_notify_lock 
*/ + uint_t mi_notify_bits; /* mi_notify_lock */ + + uint32_t mi_v12n_level; /* Virt'ion readiness */ /* + * RX groups, ring capability + * Fields of this block are SL protected. + */ + mac_group_type_t mi_rx_group_type; /* grouping type */ + uint_t mi_rx_group_count; + mac_group_t *mi_rx_groups; + + mac_capab_rings_t mi_rx_rings_cap; + + /* + * TX groups and ring capability, SL Protected. + */ + mac_group_type_t mi_tx_group_type; /* grouping type */ + uint_t mi_tx_group_count; + uint_t mi_tx_group_free; + mac_group_t *mi_tx_groups; + + mac_capab_rings_t mi_tx_rings_cap; + + mac_ring_handle_t mi_default_tx_ring; + + /* + * MAC address list. SL protected. + */ + mac_address_t *mi_addresses; + + /* + * This MAC's table of sub-flows + */ + flow_tab_t *mi_flow_tab; /* WO */ + + kstat_t *mi_ksp; /* WO */ + uint_t mi_kstat_count; /* WO */ + uint_t mi_nactiveclients; /* SL */ + + /* for broadcast and multicast support */ + struct mac_mcast_addrs_s *mi_mcast_addrs; /* mi_rw_lock */ + struct mac_bcast_grp_s *mi_bcast_grp; /* mi_rw_lock */ + uint_t mi_bcast_ngrps; /* mi_rw_lock */ + + /* list of MAC clients which opened this MAC */ + struct mac_client_impl_s *mi_clients_list; /* mi_rw_lock */ + uint_t mi_nclients; /* mi_rw_lock */ + + uint32_t mi_margin; /* mi_rw_lock */ + uint_t mi_sdu_min; /* mi_rw_lock */ + uint_t mi_sdu_max; /* mi_rw_lock */ + + /* + * Cache of factory MAC addresses provided by the driver. If + * the driver doesn't provide multiple factory MAC addresses, + * the mi_factory_addr is set to NULL, and mi_factory_addr_num + * is set to zero. + */ + mac_factory_addr_t *mi_factory_addr; /* mi_rw_lock */ + uint_t mi_factory_addr_num; /* mi_rw_lock */ + + /* for promiscuous mode support */ + kmutex_t mi_promisc_lock; + mac_cb_t *mi_promisc_list; /* mi_promisc_lock */ + mac_cb_info_t mi_promisc_cb_info; /* mi_promisc_lock */ + + /* cache of rings over this mac_impl */ + kmutex_t mi_ring_lock; + mac_ring_t *mi_ring_freelist; /* mi_ring_lock */ + + /* + * These are used for caching the properties, if any, for the + * primary MAC client. If the MAC client is not yet in place + * when the properties are set then we cache them here to be + * applied to the MAC client when it is created. + */ + mac_resource_props_t mi_resource_props; /* SL */ + + minor_t mi_minor; /* WO */ + dev_t mi_phy_dev; /* WO */ + uint32_t mi_oref; /* SL */ + uint32_t mi_unsup_note; /* WO */ + /* * List of margin value requests added by mac clients. This list is * sorted: the first one has the greatest value. */ mac_margin_req_t *mi_mmrp; mac_priv_prop_t *mi_priv_prop; uint_t mi_priv_prop_count; -} mac_impl_t; + + /* + * Hybrid I/O related definitions. 
+ */ + mac_capab_share_t mi_share_capab; + +/* This should be the last block in this structure */ +#ifdef DEBUG +#define MAC_PERIM_STACK_DEPTH 15 + int mi_perim_stack_depth; + pc_t mi_perim_stack[MAC_PERIM_STACK_DEPTH]; +#endif +}; + +/* for mi_state_flags */ +#define MIS_DISABLED 0x0001 +#define MIS_IS_VNIC 0x0002 +#define MIS_IS_AGGR 0x0004 +#define MIS_NOTIFY_DONE 0x0008 +#define MIS_EXCLUSIVE 0x0010 +#define MIS_EXCLUSIVE_HELD 0x0020 +#define MIS_LEGACY 0x0040 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start @@ -212,19 +496,193 @@ typedef struct mac_impl_s { #define mi_setpromisc mi_callbacks->mc_setpromisc #define mi_multicst mi_callbacks->mc_multicst #define mi_unicst mi_callbacks->mc_unicst -#define mi_resources mi_callbacks->mc_resources #define mi_tx mi_callbacks->mc_tx #define mi_ioctl mi_callbacks->mc_ioctl #define mi_getcapab mi_callbacks->mc_getcapab +typedef struct mac_notify_task_arg { + mac_impl_t *mnt_mip; + mac_notify_type_t mnt_type; + mac_ring_t *mnt_ring; +} mac_notify_task_arg_t; + +typedef enum { + MAC_RX_NO_RESERVE, + MAC_RX_RESERVE_DEFAULT, + MAC_RX_RESERVE_NONDEFAULT +} mac_rx_group_reserve_type_t; + +/* + * XXX All MAC_DBG_PRTs must be replaced with call to dtrace probes. For now + * it may be easier to have these printfs for easier debugging + */ +#ifdef DEBUG +extern int mac_dbg; +#define MAC_DBG_PRT(a) if (mac_dbg > 0) {(void) printf a; } +#else +#define MAC_DBG_PRT(a) +#endif + +/* + * The mac_perim_handle_t is an opaque type that encodes the 'mip' pointer + * and whether internally a mac_open was done when acquiring the perimeter. + */ +#define MAC_ENCODE_MPH(mph, mh, need_close) \ + (mph) = (mac_perim_handle_t)((uintptr_t)(mh) | need_close) + +#define MAC_DECODE_MPH(mph, mip, need_close) { \ + mip = (mac_impl_t *)(((uintptr_t)mph) & ~0x1); \ + (need_close) = ((uintptr_t)mph & 0x1); \ +} + +typedef struct mac_client_impl_s mac_client_impl_t; + extern void mac_init(void); extern int mac_fini(void); extern void mac_stat_create(mac_impl_t *); extern void mac_stat_destroy(mac_impl_t *); extern uint64_t mac_stat_default(mac_impl_t *, uint_t); +extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); +extern void mac_create_soft_ring_kstats(mac_impl_t *, int32_t); +extern boolean_t mac_ip_hdr_length_v6(mblk_t *, ip6_t *, uint16_t *, + uint8_t *); + +extern mblk_t *mac_copymsgchain_cksum(mblk_t *); +extern mblk_t *mac_fix_cksum(mblk_t *); +extern void mac_packet_print(mac_handle_t, mblk_t *); +extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *, + mac_header_info_t *); +extern void mac_tx_notify(mac_impl_t *); + +extern boolean_t mac_callback_find(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern void mac_callback_remove_wait(mac_cb_info_t *); +extern void mac_callback_free(mac_cb_t *); +extern mac_cb_t *mac_callback_walker_cleanup(mac_cb_info_t *, mac_cb_t **); + +/* in mac_bcast.c */ +extern void mac_bcast_init(void); +extern void mac_bcast_fini(void); +extern mac_impl_t *mac_bcast_grp_mip(void *); +extern int mac_bcast_add(mac_client_impl_t *, const uint8_t *, uint16_t, + mac_addrtype_t); +extern void mac_bcast_delete(mac_client_impl_t *, const uint8_t *, uint16_t); +extern void mac_bcast_send(void *, void *, mblk_t *, boolean_t); +extern void mac_bcast_grp_free(void *); +extern void mac_bcast_refresh(mac_impl_t *, mac_multicst_t, void *, + boolean_t); 
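+
+/*
+ * Illustrative sketch (not part of the interface): how a reader walks
+ * one of the mac_cb_t lists using the walker macros defined earlier in
+ * this header. Names other than the macros and the mac_cb_t members
+ * are hypothetical.
+ *
+ *	MAC_CALLBACK_WALKER_INC(mcbi);
+ *	for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
+ *		if (mcb->mcb_flags & MCB_CONDEMNED)
+ *			continue;	(logically deleted entry)
+ *		... invoke the callback embedded in mcb->mcb_objp ...
+ *	}
+ *	MAC_CALLBACK_WALKER_DCR(mcbi, mcb_head);
+ */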
+extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t, + void *, boolean_t); -extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); +/* + * Grouping functions are used internally by MAC layer. + */ +extern int mac_group_addmac(mac_group_t *, const uint8_t *); +extern int mac_group_remmac(mac_group_t *, const uint8_t *); +extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *, + mac_group_t *); +extern mblk_t *mac_ring_tx(mac_ring_handle_t, mblk_t *); +extern mac_ring_t *mac_reserve_tx_ring(mac_impl_t *, mac_ring_t *); +extern void mac_release_tx_ring(mac_ring_handle_t); +extern mac_group_t *mac_reserve_tx_group(mac_impl_t *, mac_share_handle_t); +extern void mac_release_tx_group(mac_impl_t *, mac_group_t *); + +/* + * MAC address functions are used internally by MAC layer. + */ +extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *); +extern boolean_t mac_check_macaddr_shared(mac_address_t *); +extern int mac_update_macaddr(mac_address_t *, uint8_t *); +extern void mac_freshen_macaddr(mac_address_t *, uint8_t *); +extern void mac_retrieve_macaddr(mac_address_t *, uint8_t *); +extern void mac_init_macaddr(mac_impl_t *); +extern void mac_fini_macaddr(mac_impl_t *); + +/* + * Flow construction/destruction routines. + * Not meant to be used by mac clients. + */ +extern int mac_link_flow_init(mac_client_handle_t, flow_entry_t *); +extern void mac_link_flow_clean(mac_client_handle_t, flow_entry_t *); + +/* + * Called from mac_provider.c + */ +extern void mac_fanout_recompute(mac_impl_t *); + +/* + * The following functions are used internally by the MAC layer to + * add/remove/update flows associated with a mac_impl_t. They should + * never be used directly by MAC clients. + */ +extern int mac_datapath_setup(mac_client_impl_t *, flow_entry_t *, uint32_t); +extern void mac_datapath_teardown(mac_client_impl_t *, flow_entry_t *, + uint32_t); +extern void mac_srs_group_setup(mac_client_impl_t *, flow_entry_t *, + mac_group_t *, uint32_t); +extern void mac_srs_group_teardown(mac_client_impl_t *, flow_entry_t *, + uint32_t); +extern int mac_rx_classify_flow_quiesce(flow_entry_t *, void *); +extern int mac_rx_classify_flow_restart(flow_entry_t *, void *); +extern void mac_tx_client_quiesce(mac_client_impl_t *, uint_t); +extern void mac_tx_client_restart(mac_client_impl_t *); +extern void mac_client_quiesce(mac_client_impl_t *); +extern void mac_client_restart(mac_client_impl_t *); + +extern void mac_flow_update_priority(mac_client_impl_t *, flow_entry_t *); + +extern void mac_flow_rem_subflow(flow_entry_t *); +extern void mac_rename_flow(flow_entry_t *, const char *); +extern void mac_flow_set_name(flow_entry_t *, const char *); + +extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t); +extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t); +extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); +extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *); + +extern void i_mac_share_alloc(mac_client_impl_t *); +extern void i_mac_share_free(mac_client_impl_t *); +extern void i_mac_perim_enter(mac_impl_t *); +extern void i_mac_perim_exit(mac_impl_t *); +extern int i_mac_perim_enter_nowait(mac_impl_t *); +extern void i_mac_tx_srs_notify(mac_impl_t *, mac_ring_handle_t); +extern int mac_hold(const char *, mac_impl_t **); +extern void mac_rele(mac_impl_t *); +extern int i_mac_disable(mac_impl_t *); +extern void i_mac_notify(mac_impl_t *, 
mac_notify_type_t);
+extern void i_mac_notify_exit(mac_impl_t *);
+extern int mac_start(mac_impl_t *);
+extern void mac_stop(mac_impl_t *);
+extern void mac_rx_group_unmark(mac_group_t *, uint_t);
+extern void mac_tx_client_flush(mac_client_impl_t *);
+extern void mac_tx_client_block(mac_client_impl_t *);
+extern void mac_tx_client_unblock(mac_client_impl_t *);
+extern int i_mac_promisc_set(mac_impl_t *, boolean_t, mac_promisc_type_t);
+extern void i_mac_promisc_walker_cleanup(mac_impl_t *);
+extern mactype_t *mactype_getplugin(const char *);
+extern void mac_addr_factory_init(mac_impl_t *);
+extern void mac_addr_factory_fini(mac_impl_t *);
+extern void mac_register_priv_prop(mac_impl_t *, mac_priv_prop_t *, uint_t);
+extern void mac_unregister_priv_prop(mac_impl_t *);
+extern int mac_init_rings(mac_impl_t *, mac_ring_type_t);
+extern void mac_free_rings(mac_impl_t *, mac_ring_type_t);
+
+extern int mac_start_group(mac_group_t *);
+extern void mac_stop_group(mac_group_t *);
+extern int mac_start_ring(mac_ring_t *);
+extern void mac_stop_ring(mac_ring_t *);
+extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *);
+extern int mac_remove_macaddr(mac_address_t *);
+
+extern void mac_set_rx_group_state(mac_group_t *, mac_group_state_t);
+extern void mac_rx_group_add_client(mac_group_t *, mac_client_impl_t *);
+extern void mac_rx_group_remove_client(mac_group_t *, mac_client_impl_t *);
+extern int i_mac_group_add_ring(mac_group_t *, mac_ring_t *, int);
+extern void i_mac_group_rem_ring(mac_group_t *, mac_ring_t *, boolean_t);

 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
new file mode 100644
index 0000000000..9564efc00d
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -0,0 +1,478 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MAC_PROVIDER_H
+#define _SYS_MAC_PROVIDER_H
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/mac_flow.h>
+#include <sys/mac.h>
+
+/*
+ * MAC Provider Interface
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * MAC version identifier. This is used by mac_alloc() and mac_register() to
+ * verify that incompatible drivers don't register.
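+ *
+ * For example (illustrative): a driver built against this interface
+ * calls mac_alloc(MAC_VERSION); if the framework's notion of the
+ * version differs, mac_alloc() is expected to fail (return NULL), so
+ * a mismatched driver never reaches mac_register().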
+ */
+#define MAC_VERSION 0x1
+
+/*
+ * Opaque handle types
+ */
+typedef struct __mac_rule_handle *mac_rule_handle_t;
+
+/*
+ * Statistics
+ */
+
+#define XCVR_UNDEFINED 0
+#define XCVR_NONE 1
+#define XCVR_10 2
+#define XCVR_100T4 3
+#define XCVR_100X 4
+#define XCVR_100T2 5
+#define XCVR_1000X 6
+#define XCVR_1000T 7
+
+#ifdef _KERNEL
+
+/*
+ * Definitions for MAC Drivers Capabilities
+ */
+/*
+ * MAC layer capabilities. These capabilities are handled by the drivers'
+ * mc_getcapab() callbacks. Some capabilities require the driver to fill
+ * in a given data structure, and others are simply boolean capabilities.
+ * Note that capability values must be powers of 2 so that consumers and
+ * providers of this interface can keep track of which capabilities they
+ * care about by keeping a bitfield of these things around somewhere.
+ */
+typedef enum {
+ /*
+ * Capabilities reserved for internal use only
+ */
+ MAC_CAPAB_VNIC = 0x0001, /* data is mac_capab_vnic_t */
+ MAC_CAPAB_ANCHOR_VNIC = 0x0002, /* boolean only, no data */
+ MAC_CAPAB_AGGR = 0x0004, /* data is mac_capab_aggr_t */
+ MAC_CAPAB_NO_NATIVEVLAN = 0x0008, /* boolean only, no data */
+ MAC_CAPAB_NO_ZCOPY = 0x0010, /* boolean only, no data */
+ MAC_CAPAB_LEGACY = 0x0020, /* data is mac_capab_legacy_t */
+
+ /*
+ * Public Capabilities
+ */
+ MAC_CAPAB_HCKSUM = 0x0100, /* data is a uint32_t */
+ MAC_CAPAB_LSO = 0x0200, /* data is mac_capab_lso_t */
+ MAC_CAPAB_RINGS = 0x0400, /* data is mac_capab_rings_t */
+ MAC_CAPAB_MULTIFACTADDR = 0x0800, /* mac_capab_multifactaddr_t */
+ MAC_CAPAB_SHARES = 0x1000 /* data is mac_capab_share_t */
+
+ /* add new capabilities here */
+} mac_capab_t;
+
+
+/*
+ * LSO capability
+ */
+typedef struct lso_basic_tcp_ipv4_s {
+ t_uscalar_t lso_max; /* maximum payload */
+} lso_basic_tcp_ipv4_t;
+
+/*
+ * Currently supported flags for LSO.
+ */
+#define LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */
+
+/*
+ * Future LSO capabilities can be added at the end of the mac_capab_lso_t.
+ * When such a capability is added to the GLDv3 framework, the size of the
+ * mac_capab_lso_t it allocates and passes to the drivers increases. Older
+ * drivers will access only the (upper) sections of that structure, that is the
+ * sections carrying the capabilities they understand. This ensures the
+ * interface can be safely extended in a binary compatible way.
+ */
+typedef struct mac_capab_lso_s {
+ t_uscalar_t lso_flags;
+ lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4;
+ /* Add future lso capabilities here */
+} mac_capab_lso_t;
+
+/*
+ * Multiple Factory MAC Addresses Capability
+ */
+typedef struct mac_capab_multifactaddr_s {
+ /*
+ * Number of factory addresses
+ */
+ uint_t mcm_naddr;
+
+ /*
+ * Callbacks to query all the factory addresses.
+ */
+ void (*mcm_getaddr)(void *, uint_t, uint8_t *);
+} mac_capab_multifactaddr_t;
+
+/*
+ * MAC driver entry point types.
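+ *
+ * Each entry point receives as its first argument the driver-private
+ * handle that the driver supplied in m_driver at registration time.
+ * A minimal transmit entry point might look like this (illustrative
+ * sketch; the xx_* names are hypothetical driver code):
+ *
+ *	static mblk_t *
+ *	xx_m_tx(void *arg, mblk_t *mp)
+ *	{
+ *		xx_softstate_t *xxp = arg;
+ *
+ *		... queue mp to the hardware; return the untransmitted
+ *		    remainder of the chain, or NULL ...
+ *	}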
+ */
+typedef int (*mac_getstat_t)(void *, uint_t, uint64_t *);
+typedef int (*mac_start_t)(void *);
+typedef void (*mac_stop_t)(void *);
+typedef int (*mac_setpromisc_t)(void *, boolean_t);
+typedef int (*mac_multicst_t)(void *, boolean_t, const uint8_t *);
+typedef int (*mac_unicst_t)(void *, const uint8_t *);
+typedef void (*mac_ioctl_t)(void *, queue_t *, mblk_t *);
+typedef void (*mac_resources_t)(void *);
+typedef mblk_t *(*mac_tx_t)(void *, mblk_t *);
+typedef boolean_t (*mac_getcapab_t)(void *, mac_capab_t, void *);
+typedef int (*mac_open_t)(void *);
+typedef void (*mac_close_t)(void *);
+typedef int (*mac_set_prop_t)(void *, const char *, mac_prop_id_t,
+ uint_t, const void *);
+typedef int (*mac_get_prop_t)(void *, const char *, mac_prop_id_t,
+ uint_t, uint_t, void *, uint_t *);
+
+/*
+ * Drivers must set all of these callbacks except for mc_resources,
+ * mc_ioctl, and mc_getcapab, which are optional. If any of these optional
+ * callbacks are set, their appropriate flags must be set in mc_callbacks.
+ * Any future additions to this list must also be accompanied by an
+ * associated mc_callbacks flag so that the framework can grow without
+ * affecting the binary compatibility of the interface.
+ */
+typedef struct mac_callbacks_s {
+ uint_t mc_callbacks; /* Denotes which callbacks are set */
+ mac_getstat_t mc_getstat; /* Get the value of a statistic */
+ mac_start_t mc_start; /* Start the device */
+ mac_stop_t mc_stop; /* Stop the device */
+ mac_setpromisc_t mc_setpromisc; /* Enable or disable promiscuous mode */
+ mac_multicst_t mc_multicst; /* Enable or disable a multicast addr */
+ mac_unicst_t mc_unicst; /* Set the unicast MAC address */
+ mac_tx_t mc_tx; /* Transmit a packet */
+ mac_ioctl_t mc_ioctl; /* Process an unknown ioctl */
+ mac_getcapab_t mc_getcapab; /* Get capability information */
+ mac_open_t mc_open; /* Open the device */
+ mac_close_t mc_close; /* Close the device */
+ mac_set_prop_t mc_setprop;
+ mac_get_prop_t mc_getprop;
+} mac_callbacks_t;
+
+typedef struct mac_priv_prop_s {
+ char mpp_name[MAXLINKPROPNAME];
+ uint_t mpp_flags;
+} mac_priv_prop_t;
+
+/*
+ * Virtualization Capabilities
+ */
+/*
+ * The ordering of entries below is important. MAC_HW_CLASSIFIER is the
+ * cutoff: entries below it don't depend on H/W. MAC_HW_CLASSIFIER and
+ * entries after it are cases where H/W has been updated through
+ * add/modify/delete APIs.
+ */
+typedef enum {
+ MAC_NO_CLASSIFIER = 0,
+ MAC_SW_CLASSIFIER,
+ MAC_HW_CLASSIFIER
+} mac_classify_type_t;
+
+typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+
+/*
+ * The virtualization level conveys the extent of the NIC hardware assistance
+ * for traffic steering employed for virtualization:
+ *
+ * MAC_VIRT_NONE: No assist for v12n.
+ *
+ * MAC_VIRT_LEVEL1: Multiple Rx rings with MAC address level
+ * classification between groups of rings.
+ * Requires the support of the MAC_CAPAB_RINGS
+ * capability.
+ *
+ * MAC_VIRT_HIO: Hybrid I/O capable MAC. Requires the support
+ * of the MAC_CAPAB_SHARES capability.
+ *
+ * MAC_VIRT_SERIALIZE: Temporary flag *ONLY* for nxge. The MAC layer
+ * uses this to enable the mac Tx serializer on
+ * outbound traffic and to always enqueue
+ * incoming traffic on Rx soft rings in mac.
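+ *
+ * A driver advertises its level in the m_v12n field of mac_register_t
+ * (defined later in this header), e.g. (illustrative):
+ *
+ *	mregp->m_v12n = MAC_VIRT_LEVEL1 | MAC_VIRT_HIO;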
+ */
+#define MAC_VIRT_NONE 0x0
+#define MAC_VIRT_LEVEL1 0x1
+#define MAC_VIRT_HIO 0x2
+#define MAC_VIRT_SERIALIZE 0x4
+
+typedef enum {
+ MAC_RING_TYPE_RX = 1, /* Receive ring */
+ MAC_RING_TYPE_TX /* Transmit ring */
+} mac_ring_type_t;
+
+#define MAX_RINGS_PER_GROUP 32
+
+/*
+ * Grouping type of a ring group
+ *
+ * MAC_GROUP_TYPE_STATIC: The ring group cannot be re-grouped.
+ * MAC_GROUP_TYPE_DYNAMIC: The ring group supports dynamic re-grouping.
+ */
+typedef enum {
+ MAC_GROUP_TYPE_STATIC = 1, /* Static ring group */
+ MAC_GROUP_TYPE_DYNAMIC /* Dynamic ring group */
+} mac_group_type_t;
+
+typedef struct __mac_ring_driver *mac_ring_driver_t;
+typedef struct __mac_group_driver *mac_group_driver_t;
+
+typedef struct mac_ring_info_s mac_ring_info_t;
+typedef struct mac_group_info_s mac_group_info_t;
+
+typedef void (*mac_get_ring_t)(void *, mac_ring_type_t, const int, const int,
+ mac_ring_info_t *, mac_ring_handle_t);
+typedef void (*mac_get_group_t)(void *, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
+
+typedef void (*mac_group_add_ring_t)(mac_group_driver_t,
+ mac_ring_driver_t, mac_ring_type_t);
+typedef void (*mac_group_rem_ring_t)(mac_group_driver_t,
+ mac_ring_driver_t, mac_ring_type_t);
+
+/*
+ * Multiple Rings Capability
+ */
+typedef struct mac_capab_rings_s {
+ mac_ring_type_t mr_type; /* Ring type: Rx vs Tx */
+ mac_group_type_t mr_group_type; /* Dynamic vs static grouping */
+ uint_t mr_rnum; /* Number of rings */
+ uint_t mr_gnum; /* Number of ring groups */
+ mac_get_ring_t mr_rget; /* Get ring from driver */
+ mac_get_group_t mr_gget; /* Get ring group from driver */
+ mac_group_add_ring_t mr_gaddring; /* Add ring into a group */
+ mac_group_rem_ring_t mr_gremring; /* Remove ring from a group */
+} mac_capab_rings_t;
+
+/*
+ * Common ring functions and driver interfaces
+ */
+typedef int (*mac_ring_start_t)(mac_ring_driver_t, uint64_t);
+typedef void (*mac_ring_stop_t)(mac_ring_driver_t);
+
+typedef mblk_t *(*mac_ring_send_t)(void *, mblk_t *);
+typedef mblk_t *(*mac_ring_poll_t)(void *, int);
+
+typedef struct mac_ring_info_s {
+ mac_ring_driver_t mri_driver;
+ mac_ring_start_t mri_start;
+ mac_ring_stop_t mri_stop;
+ mac_intr_t mri_intr;
+ union {
+ mac_ring_send_t send;
+ mac_ring_poll_t poll;
+ } mrfunion;
+} mac_ring_info_s;
+
+#define mri_tx mrfunion.send
+#define mri_poll mrfunion.poll
+
+typedef int (*mac_group_start_t)(mac_group_driver_t);
+typedef void (*mac_group_stop_t)(mac_group_driver_t);
+typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
+typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
+
+struct mac_group_info_s {
+ mac_group_driver_t mgi_driver; /* Driver reference */
+ mac_group_start_t mgi_start; /* Start the group */
+ mac_group_stop_t mgi_stop; /* Stop the group */
+ uint_t mgi_count; /* Count of rings */
+ mac_intr_t mgi_intr; /* Optional per-group intr */
+
+ /* Only used for rx groups */
+ mac_add_mac_addr_t mgi_addmac; /* Add a MAC address */
+ mac_rem_mac_addr_t mgi_remmac; /* Remove a MAC address */
+};
+
+/*
+ * Share management functions.
+ */
+typedef uint64_t mac_share_handle_t;
+
+/*
+ * Allocate and free a share. Returns ENOSPC if all shares have been
+ * previously allocated.
+ */
+typedef int (*mac_alloc_share_t)(void *, mac_share_handle_t *);
+typedef void (*mac_free_share_t)(mac_share_handle_t);
+
+/*
+ * Bind and unbind a share. Binding a share allows a domain
+ * to have direct access to the groups and rings associated with
+ * that share.
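+ *
+ * Illustrative use (hypothetical driver callback invocation; the
+ * cookie semantics are driver-specific):
+ *
+ *	if (cap_share.ms_sbind(sh, cookie, &rcookie) != 0)
+ *		... binding failed ...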
+ */
+typedef int (*mac_bind_share_t)(mac_share_handle_t, uint64_t, uint64_t *);
+typedef void (*mac_unbind_share_t)(mac_share_handle_t);
+
+/*
+ * Return information about a share.
+ */
+typedef void (*mac_share_query_t)(mac_share_handle_t, mac_ring_type_t,
+ mac_ring_handle_t *, uint_t *);
+
+/*
+ * The basic idea: bind previously created ring groups to shares
+ * so that they can be exported (or shared) to another domain.
+ * These interfaces bind/unbind the ring group to a share.
+ * The groups and their rings will be shared with the guest
+ * as soon as the share is bound.
+ */
+typedef int (*mac_share_add_group_t)(mac_share_handle_t,
+ mac_group_driver_t);
+typedef int (*mac_share_rem_group_t)(mac_share_handle_t,
+ mac_group_driver_t);
+
+typedef struct mac_capab_share_s {
+ uint_t ms_snum; /* Number of shares (vr's) */
+ void *ms_handle; /* Handle to driver. */
+ mac_alloc_share_t ms_salloc; /* Get a share from driver. */
+ mac_free_share_t ms_sfree; /* Return a share to driver. */
+ mac_share_add_group_t ms_sadd; /* Add a group to the share. */
+ mac_share_rem_group_t ms_sremove; /* Remove group from share. */
+ mac_share_query_t ms_squery; /* Query share constraints */
+ mac_bind_share_t ms_sbind; /* Bind a share */
+ mac_unbind_share_t ms_sunbind; /* Unbind a share */
+} mac_capab_share_t;
+
+/*
+ * MAC registration interface
+ */
+typedef struct mac_register_s {
+ uint_t m_version; /* set by mac_alloc() */
+ const char *m_type_ident;
+ void *m_driver; /* Driver private data */
+ dev_info_t *m_dip;
+ uint_t m_instance;
+ uint8_t *m_src_addr;
+ uint8_t *m_dst_addr;
+ mac_callbacks_t *m_callbacks;
+ uint_t m_min_sdu;
+ uint_t m_max_sdu;
+ void *m_pdata;
+ size_t m_pdata_size;
+ uint32_t m_margin;
+ mac_priv_prop_t *m_priv_props;
+ size_t m_priv_prop_count;
+ uint32_t m_v12n; /* Virtualization level */
+} mac_register_t;
+
+/*
+ * Flags for mc_callbacks. Requiring drivers to set the flags associated
+ * with optional callbacks initialized in the structure allows the mac
+ * module to add optional callbacks in the future without requiring drivers
+ * to recompile.
+ */
+#define MC_IOCTL 0x001
+#define MC_GETCAPAB 0x002
+#define MC_OPEN 0x004
+#define MC_CLOSE 0x008
+#define MC_SETPROP 0x010
+#define MC_GETPROP 0x020
+
+/*
+ * Driver interface functions.
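+ *
+ * Typical attach-time usage (illustrative sketch; the xx_* names are
+ * hypothetical driver code):
+ *
+ *	mac_register_t *mregp;
+ *	mac_handle_t mh;
+ *
+ *	if ((mregp = mac_alloc(MAC_VERSION)) == NULL)
+ *		return (DDI_FAILURE);
+ *	mregp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ *	mregp->m_driver = xxp;
+ *	mregp->m_dip = dip;
+ *	mregp->m_src_addr = xxp->xx_macaddr;
+ *	mregp->m_callbacks = &xx_m_callbacks;
+ *	    (xx_m_callbacks.mc_callbacks carries MC_IOCTL etc. for the
+ *	    optional entry points the driver implements)
+ *	mregp->m_min_sdu = 0;
+ *	mregp->m_max_sdu = ETHERMTU;
+ *	err = mac_register(mregp, &mh);
+ *	mac_free(mregp);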
+ */ +extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); +extern int mac_maxsdu_update(mac_handle_t, uint_t); +extern int mac_set_prop(mac_handle_t, mac_prop_t *, + void *, uint_t); +extern int mac_get_prop(mac_handle_t, mac_prop_t *, + void *, uint_t, uint_t *); + +extern mac_register_t *mac_alloc(uint_t); +extern void mac_free(mac_register_t *); +extern int mac_register(mac_register_t *, mac_handle_t *); +extern int mac_disable_nowait(mac_handle_t); +extern int mac_disable(mac_handle_t); +extern int mac_unregister(mac_handle_t); +extern void mac_rx(mac_handle_t, mac_resource_handle_t, + mblk_t *); +extern void mac_rx_ring(mac_handle_t, mac_ring_handle_t, + mblk_t *, uint64_t); +extern void mac_link_update(mac_handle_t, link_state_t); +extern void mac_unicst_update(mac_handle_t, + const uint8_t *); +extern void mac_tx_update(mac_handle_t); +extern void mac_tx_ring_update(mac_handle_t, + mac_ring_handle_t); +extern void mac_resource_update(mac_handle_t); +extern void mac_capab_update(mac_handle_t); +extern int mac_pdata_update(mac_handle_t, void *, + size_t); +extern void mac_multicast_refresh(mac_handle_t, + mac_multicst_t, void *, boolean_t); +extern void mac_unicst_refresh(mac_handle_t, mac_unicst_t, + void *); +extern void mac_promisc_refresh(mac_handle_t, + mac_setpromisc_t, void *); +extern boolean_t mac_margin_update(mac_handle_t, uint32_t); +extern void mac_margin_get(mac_handle_t, uint32_t *); +extern int mac_margin_remove(mac_handle_t, uint32_t); +extern int mac_margin_add(mac_handle_t, uint32_t *, + boolean_t); +extern void mac_init_ops(struct dev_ops *, const char *); +extern void mac_fini_ops(struct dev_ops *); +extern uint32_t mac_no_notification(mac_handle_t); + +extern mactype_register_t *mactype_alloc(uint_t); +extern void mactype_free(mactype_register_t *); +extern int mactype_register(mactype_register_t *); +extern int mactype_unregister(const char *); +extern void mac_set_ring(void *, void *); + +extern boolean_t mac_unicst_verify(mac_handle_t, + const uint8_t *, uint_t); + +extern boolean_t mac_is_vnic(mac_handle_t); + +extern int mac_group_add_ring(mac_group_handle_t, int); +extern void mac_group_rem_ring(mac_group_handle_t, + mac_ring_handle_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_PROVIDER_H */ diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h new file mode 100644 index 0000000000..45fcdf65bf --- /dev/null +++ b/usr/src/uts/common/sys/mac_soft_ring.h @@ -0,0 +1,724 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_MAC_SOFT_RING_H +#define _SYS_MAC_SOFT_RING_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/processor.h> +#include <sys/stream.h> +#include <sys/squeue.h> +#include <sys/dlpi.h> +#include <sys/mac_impl.h> + +#define S_RING_NAMELEN 64 + +#define MAX_SR_FANOUT 32 + +extern boolean_t mac_soft_ring_enable; +extern boolean_t mac_latency_optimize; + +typedef struct mac_soft_ring_s mac_soft_ring_t; +typedef struct mac_soft_ring_set_s mac_soft_ring_set_t; + +typedef void (*mac_soft_ring_drain_func_t)(mac_soft_ring_t *); +typedef mac_tx_cookie_t (*mac_tx_func_t)(mac_soft_ring_set_t *, mblk_t *, + uintptr_t, uint16_t, mblk_t **); + + +/* Tx notify callback */ +typedef struct mac_tx_notify_cb_s { + mac_cb_t mtnf_link; /* Linked list of callbacks */ + mac_tx_notify_t mtnf_fn; /* The callback function */ + void *mtnf_arg; /* Callback function argument */ +} mac_tx_notify_cb_t; + +struct mac_soft_ring_s { + /* Keep the most used members 64bytes cache aligned */ + kmutex_t s_ring_lock; /* lock before using any member */ + uint16_t s_ring_type; /* processing model of the sq */ + uint16_t s_ring_state; /* state flags and message count */ + int s_ring_count; /* # of mblocks in mac_soft_ring */ + size_t s_ring_size; /* Size of data queued */ + mblk_t *s_ring_first; /* first mblk chain or NULL */ + mblk_t *s_ring_last; /* last mblk chain or NULL */ + + mac_direct_rx_t s_ring_rx_func; + void *s_ring_rx_arg1; + mac_resource_handle_t s_ring_rx_arg2; + + /* + * Threshold after which packets get dropped. + * Is always greater than s_ring_tx_hiwat + */ + int s_ring_tx_max_q_cnt; + /* # of mblocks after which to apply flow control */ + int s_ring_tx_hiwat; + /* # of mblocks after which to relieve flow control */ + int s_ring_tx_lowat; + boolean_t s_ring_tx_woken_up; + uint32_t s_ring_blocked_cnt; /* times blocked for Tx descs */ + uint32_t s_ring_unblocked_cnt; /* unblock calls from driver */ + uint32_t s_ring_hiwat_cnt; /* times blocked for Tx descs */ + + void *s_ring_tx_arg1; + void *s_ring_tx_arg2; + + /* Tx notify callback */ + mac_cb_info_t s_ring_notify_cb_info; /* cb list info */ + mac_cb_t *s_ring_notify_cb_list; /* The cb list */ + + clock_t s_ring_awaken; /* time async thread was awakened */ + + kthread_t *s_ring_run; /* Current thread processing sq */ + processorid_t s_ring_cpuid; /* processor to bind to */ + processorid_t s_ring_cpuid_save; /* saved cpuid during offline */ + kcondvar_t s_ring_async; /* async thread blocks on */ + clock_t s_ring_wait; /* lbolts to wait after a fill() */ + timeout_id_t s_ring_tid; /* timer id of pending timeout() */ + kthread_t *s_ring_worker; /* kernel thread id */ + char s_ring_name[S_RING_NAMELEN + 1]; + uint32_t s_ring_total_inpkt; + uint32_t s_ring_drops; + struct mac_client_impl_s *s_ring_mcip; + void *s_ring_flent; + kstat_t *s_ring_ksp; + + /* Teardown, poll disable control ops */ + kcondvar_t s_ring_client_cv; /* Client wait for control op */ + + mac_soft_ring_set_t *s_ring_set; /* The SRS this ring belongs to */ + mac_soft_ring_t *s_ring_next; + mac_soft_ring_t *s_ring_prev; + mac_soft_ring_drain_func_t s_ring_drain_func; +}; + +typedef void (*mac_srs_drain_proc_t)(mac_soft_ring_set_t *, uint_t); + +/* Transmit side Soft Ring Set */ +typedef struct mac_srs_tx_s { + /* Members for Tx size processing */ + uint32_t st_mode; + mac_tx_func_t st_func; + void *st_arg1; + void *st_arg2; + mac_group_t *st_group; /* TX group for share */ + boolean_t st_woken_up; + + /* + * st_max_q_cnt is 
the queue depth threshold to limit
+ * outstanding packets on the Tx SRS. Once the limit
+ * is reached, the Tx SRS will drop packets until the
+ * queue length goes back below the threshold.
+ */
+ uint32_t st_max_q_cnt; /* max. outstanding packets */
+ /*
+ * st_hiwat is used in Tx serializer and bandwidth mode.
+ * This is the queue depth threshold up to which
+ * packets will get buffered with no flow-control
+ * back pressure applied to the caller. Once this
+ * threshold is reached, back pressure will be
+ * applied to the caller of mac_tx() (mac_tx() starts
+ * returning a cookie to indicate a blocked SRS).
+ * st_hiwat should always be less than or equal to
+ * st_max_q_cnt.
+ */
+ uint32_t st_hiwat; /* mblk cnt to apply flow control */
+ uint32_t st_lowat; /* mblk cnt to relieve flow control */
+ uint32_t st_drop_count;
+ /*
+ * The number of times the SRS gets blocked due to lack of Tx
+ * descriptors is noted down, as is the corresponding wakeup
+ * from the driver to unblock. They should match in a
+ * correctly working setup. If there are fewer unblocks
+ * than blocks, then the Tx side waits forever for a wakeup
+ * from below. The following are protected by srs_lock.
+ */
+ uint32_t st_blocked_cnt; /* times blocked for Tx descs */
+ uint32_t st_unblocked_cnt; /* unblock calls from driver */
+ uint32_t st_hiwat_cnt; /* times blocked for Tx descs */
+} mac_srs_tx_t;
+
+/* Receive side Soft Ring Set */
+typedef struct mac_srs_rx_s {
+ /*
+ * Upcall function for fanout, Rx processing etc. Perhaps
+ * the same 3 members below can be used for Tx
+ * processing, but looking around, mac_rx_func_t has
+ * proliferated too much into various files at different
+ * places. I am leaving the consolidation battle for
+ * another day.
+ */
+ mac_direct_rx_t sr_func; /* srs_lock */
+ void *sr_arg1; /* srs_lock */
+ mac_resource_handle_t sr_arg2; /* srs_lock */
+ mac_rx_func_t sr_lower_proc; /* Atomically changed */
+ boolean_t sr_enqueue_always; /* enqueue at soft ring */
+ uint32_t sr_poll_pkt_cnt;
+ uint32_t sr_poll_thres;
+
+ /* mblk cnt to apply flow control */
+ uint32_t sr_hiwat;
+ /* mblk cnt to relieve flow control */
+ uint32_t sr_lowat;
+ uint32_t sr_poll_count;
+ uint32_t sr_intr_count;
+ uint32_t sr_drop_count;
+
+ /* Times polling was enabled */
+ uint32_t sr_poll_on;
+ /* Times polling was enabled by worker thread */
+ uint32_t sr_worker_poll_on;
+ /* Times polling was disabled */
+ uint32_t sr_poll_off;
+ /* Poll thread signalled count */
+ uint32_t sr_poll_thr_sig;
+ /* Poll thread busy */
+ uint32_t sr_poll_thr_busy;
+ /* SRS drains, stays in poll mode but doesn't poll */
+ uint32_t sr_poll_drain_no_poll;
+ /*
+ * SRS has nothing to do and no packets in H/W but
+ * there is a backlog in softrings. SRS stays in
+ * poll mode but doesn't do polling.
+ */
+ uint32_t sr_poll_no_poll;
+ /* Active polling restarted */
+ uint32_t sr_below_hiwat;
+ /* Found packets in last poll so try and poll again */
+ uint32_t sr_poll_again;
+ /*
+ * Packets in queue but poll thread not allowed to process so
+ * signal the worker thread.
+ */
+ uint32_t sr_poll_sig_worker;
+ /*
+ * Poll thread has nothing to do and H/W has nothing so
+ * reenable the interrupts.
+ */
+ uint32_t sr_poll_intr_enable;
+ /*
+ * Poll thread has nothing to do and worker thread was already
+ * running so it can decide to reenable interrupt or poll again.
+ */
+ uint32_t sr_poll_goto_sleep;
+ /* Worker thread goes back to draining the queue */
+ uint32_t sr_drain_again;
+ /* More packets in queue so signal the worker thread to drain */
+ uint32_t sr_drain_worker_sig;
+ /* Poll thread is already running so worker has nothing to do */
+ uint32_t sr_drain_poll_running;
+ /* We have packets already queued so keep polling */
+ uint32_t sr_drain_keep_polling;
+ /* Drain is done and interrupts are reenabled */
+ uint32_t sr_drain_finish_intr;
+ /* Polling thread needs to schedule worker wakeup */
+ uint32_t sr_poll_worker_wakeup;
+
+ /* Chains less than 10 pkts */
+ uint32_t sr_chain_cnt_undr10;
+ /* Chains between 10 & 50 pkts */
+ uint32_t sr_chain_cnt_10to50;
+ /* Chains over 50 pkts */
+ uint32_t sr_chain_cnt_over50;
+} mac_srs_rx_t;
+
+/*
+ * mac_soft_ring_set_s:
+ * This is used for both the Tx and Rx side. The srs_type identifies Rx or
+ * Tx type.
+ *
+ * Note that the structure is carefully crafted, with Rx elements coming
+ * first followed by Tx specific members. Future additions to this
+ * structure should follow the same guidelines.
+ *
+ * Rx-side notes:
+ * mac_rx_classify_flow_add() always creates a mac_soft_ring_set_t and fn_flow
+ * points to info from it (func = srs_lower_proc, arg = soft_ring_set). On
+ * the interrupt path, srs_lower_proc does B/W adjustment, switches to polling
+ * mode (if poll capable) and feeds the packets to the soft_ring_list via the
+ * chosen fanout type (specified by srs_type). In poll mode, the poll thread
+ * can pick up the packets and feed them to the various soft_ring_list.
+ *
+ * The srs_type can either be protocol based or fanout based, where the fanout
+ * itself can be of various types.
+ *
+ * The polling works by turning off interrupts as soon as packets
+ * are queued on the soft ring set. Once the backlog is clear and the poll
+ * thread returns empty-handed, i.e. the Rx ring doesn't have anything, the
+ * interrupt is turned back on. For this purpose we keep a separate
+ * srs_poll_pkt_cnt counter which tracks the packets queued between the SRS
+ * and the soft rings as well. The counter is incremented when packets
+ * are queued and decremented when the SRS processes them (in case it has
+ * no soft rings) or when the soft rings process them. It's important that
+ * in case the SRS has soft rings, the decrement doesn't happen till the
+ * packet is processed by the soft rings, since it takes very little time
+ * for the SRS to queue a packet to the soft rings and it would otherwise keep
+ * bringing more packets into the system faster than the soft rings can
+ * process them.
+ *
+ * Tx side notes:
+ * The srs structure acts as a serializer with a worker thread. The
+ * default behavior of srs though is to act as a pass-thru. The queues
+ * (srs_first, srs_last, srs_count) get used when the Tx ring runs out of Tx
+ * descriptors or to enforce bandwidth limits.
+ *
+ * When multiple Tx rings are present, the SRS state will be set to
+ * SRS_FANOUT_OTH. Outgoing packets coming into the mac_tx_srs_process()
+ * function will be fanned out to one of the Tx side soft rings based on
+ * a hint passed in mac_tx_srs_process(). Each soft ring, in turn, will
+ * be associated with a distinct h/w Tx ring.
+ */
+
+struct mac_soft_ring_set_s {
+ /*
+ * Common elements, common to both Rx and Tx SRS type.
+ * The following block of fields is protected by srs_lock
+ */
+ kmutex_t srs_lock;
+ uint32_t srs_type;
+ uint32_t srs_state; /* state flags */
+ uint32_t srs_count;
+ mblk_t *srs_first; /* first mblk chain or NULL */
+ mblk_t *srs_last; /* last mblk chain or NULL */
+ kcondvar_t srs_async; /* cv for worker thread */
+ kcondvar_t srs_cv; /* cv for poll thread */
+ kcondvar_t srs_quiesce_done_cv; /* cv for removal */
+ timeout_id_t srs_tid; /* timeout id for pending timeout */
+
+ /*
+ * List of soft rings & processing function.
+ * The following block is protected by Rx quiescence,
+ * i.e. they can be changed only after quiescing the SRS.
+ * Protected by srs_lock.
+ */
+ mac_soft_ring_t *srs_soft_ring_head;
+ mac_soft_ring_t *srs_soft_ring_tail;
+ int srs_soft_ring_count;
+ int srs_soft_ring_quiesced_count;
+ int srs_soft_ring_condemned_count;
+ mac_soft_ring_t **srs_tcp_soft_rings;
+ int srs_tcp_ring_count;
+ mac_soft_ring_t **srs_udp_soft_rings;
+ int srs_udp_ring_count;
+ /*
+ * srs_oth_soft_rings is also used by tx_srs when
+ * operating in multi tx ring mode.
+ */
+ mac_soft_ring_t **srs_oth_soft_rings;
+ int srs_oth_ring_count;
+
+ /*
+ * Bandwidth control related members.
+ * They are common to both Rx- and Tx-side.
+ * The following are protected by srs_lock
+ */
+ mac_bw_ctl_t *srs_bw;
+ size_t srs_size; /* Size of packets queued in bytes */
+ pri_t srs_pri;
+
+ mac_soft_ring_set_t *srs_next; /* mac_srs_g_lock */
+ mac_soft_ring_set_t *srs_prev; /* mac_srs_g_lock */
+
+ /* Attribute specific drain func (BW ctl vs non-BW ctl) */
+ mac_srs_drain_proc_t srs_drain_func; /* Write once (WO) */
+
+ /*
+ * If the associated ring is exclusively used by a mac client, e.g.,
+ * an aggregation, this field is used to keep a reference to the
+ * MAC client's pseudo ring.
+ */
+ mac_resource_handle_t srs_mrh;
+ /*
+ * The following blocks are write once (WO) and valid for the life
+ * of the SRS
+ */
+ struct mac_client_impl_s *srs_mcip; /* back ptr to mac client */
+ void *srs_flent; /* back ptr to flent */
+ mac_ring_t *srs_ring; /* Ring Descriptor */
+
+ /* Teardown, disable control ops */
+ kcondvar_t srs_client_cv; /* Client wait for the control op */
+
+ kthread_t *srs_worker; /* WO, worker thread */
+ kthread_t *srs_poll_thr; /* WO, poll thread */
+
+ uint_t srs_ind; /* Round Robin indx for picking up SR */
+ processorid_t srs_worker_cpuid; /* processor to bind to */
+ processorid_t srs_worker_cpuid_save; /* saved cpuid during offline */
+ processorid_t srs_poll_cpuid; /* processor to bind to */
+ processorid_t srs_poll_cpuid_save; /* saved cpuid during offline */
+ uint_t srs_fanout_state;
+ mac_cpus_t srs_cpu;
+
+ mac_srs_rx_t srs_rx;
+ mac_srs_tx_t srs_tx;
+};
+
+/*
+ * type flags - combination allowed to process and drain the queue
+ */
+#define ST_RING_WORKER_ONLY 0x0001 /* Worker thread only */
+#define ST_RING_ANY 0x0002 /* Any thread can process the queue */
+#define ST_RING_TCP 0x0004
+#define ST_RING_UDP 0x0008
+#define ST_RING_OTH 0x0010
+
+#define ST_RING_BW_CTL 0x0020
+#define ST_RING_TX 0x0040
+
+/*
+ * State flags.
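+ *
+ * These are applied to s_ring_state and reflect the ring's runtime
+ * condition. For example (illustrative): when the driver runs out of
+ * Tx descriptors, the ring is marked S_RING_BLOCK and presumably stays
+ * blocked until the driver's Tx-completion path triggers an unblock
+ * (e.g. via mac_tx_ring_update()).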
+/*
+ * State flags.
+ */
+#define	S_RING_PROC		0x0001	/* being processed */
+#define	S_RING_BOUND		0x0002	/* Worker thread is bound to a cpu */
+#define	S_RING_BLOCK		0x0004	/* No Tx descs */
+#define	S_RING_TX_HIWAT		0x0008	/* Tx high watermark reached */
+
+#define	S_RING_WAKEUP_CLIENT	0x0010	/* flow ctrl, client wakeup needed */
+#define	S_RING_BLANK		0x0020	/* Has been put into polling mode */
+#define	S_RING_CLIENT_WAIT	0x0040	/* Client waiting for control op */
+
+#define	S_RING_CONDEMNED	0x0100	/* Being torn down */
+#define	S_RING_CONDEMNED_DONE	0x0200	/* Being torn down */
+#define	S_RING_QUIESCE		0x0400	/* No traffic flow, transient flag */
+#define	S_RING_QUIESCE_DONE	0x0800	/* No traffic flow, transient flag */
+
+#define	S_RING_RESTART		0x1000	/* Go back to normal traffic flow */
+#define	S_RING_ENQUEUED		0x2000	/* Pkts enqueued in Tx soft ring */
+
+/*
+ * arguments for processors to bind to
+ */
+#define	S_RING_BIND_NONE	-1
+
+/*
+ * defines for srs_type - identifies a link or a sub-flow
+ * and other static characteristics of an SRS, like a Tx
+ * SRS, TCP-only SRS, etc.
+ */
+#define	SRST_LINK		0x00000001
+#define	SRST_FLOW		0x00000002
+#define	SRST_NO_SOFT_RINGS	0x00000004
+#define	SRST_TCP_ONLY		0x00000008
+
+#define	SRST_FANOUT_PROTO	0x00000010
+#define	SRST_FANOUT_SRC_IP	0x00000020
+#define	SRST_FANOUT_OTH		0x00000040
+#define	SRST_DEFAULT_GRP	0x00000080
+
+#define	SRST_TX			0x00000100
+#define	SRST_BW_CONTROL		0x00000200
+#define	SRST_DIRECT_POLL	0x00000400
+
+#define	SRST_DLS_BYPASS		0x00001000
+#define	SRST_CLIENT_POLL_ENABLED 0x00002000
+
+/*
+ * soft ring set flags. These bits are dynamic in nature and get
+ * applied to srs_state. They reflect the state of the SRS at any
+ * point in time.
+ */
+#define	SRS_BLANK		0x00000001
+#define	SRS_WORKER_BOUND	0x00000002
+#define	SRS_POLL_BOUND		0x00000004
+#define	SRS_POLLING_CAPAB	0x00000008
+
+#define	SRS_PROC		0x00000010
+#define	SRS_GET_PKTS		0x00000020
+#define	SRS_POLLING		0x00000040
+#define	SRS_BW_ENFORCED		0x00000080
+
+#define	SRS_WORKER		0x00000100
+#define	SRS_ENQUEUED		0x00000200
+#define	SRS_ANY_PROCESS		0x00000400
+#define	SRS_PROC_FAST		0x00000800
+
+#define	SRS_POLL_PROC		0x00001000
+#define	SRS_TX_BLOCKED		0x00002000	/* out of Tx descs */
+#define	SRS_TX_HIWAT		0x00004000	/* Tx count exceeds hiwat */
+#define	SRS_TX_WAKEUP_CLIENT	0x00008000	/* Flow-ctl: wakeup client */
+
+#define	SRS_CLIENT_PROC		0x00010000
+#define	SRS_CLIENT_WAIT		0x00020000
+#define	SRS_QUIESCE		0x00040000
+#define	SRS_QUIESCE_DONE	0x00080000
+
+#define	SRS_CONDEMNED		0x00100000
+#define	SRS_CONDEMNED_DONE	0x00200000
+#define	SRS_POLL_THR_QUIESCED	0x00400000
+#define	SRS_RESTART		0x00800000
+
+#define	SRS_RESTART_DONE	0x01000000
+#define	SRS_POLL_THR_RESTART	0x02000000
+#define	SRS_IN_GLIST		0x04000000
+#define	SRS_POLL_THR_EXITED	0x08000000
+
+#define	SRS_QUIESCE_PERM	0x10000000
+#define	SRS_LATENCY_OPT		0x20000000
+
+#define	SRS_QUIESCED(srs)	(srs->srs_state & SRS_QUIESCE_DONE)
+
+/*
+ * If the SRS_QUIESCE_PERM flag is set, the SRS worker thread cannot
+ * be restarted.
+ */
+#define	SRS_QUIESCED_PERMANENT(srs)	(srs->srs_state & SRS_QUIESCE_PERM)
+
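
The S_RING_QUIESCE/S_RING_QUIESCE_DONE and SRS_QUIESCE/SRS_QUIESCE_DONE pairs above encode a handshake: a control thread raises the quiesce request and waits, and the worker acknowledges before parking. Below is a small pthreads model of that handshake; the names are stand-ins for srs_async, srs_quiesce_done_cv and the flag bits, and none of this is kernel code.

/* cc -o quiesce_model quiesce_model.c -lpthread */
#include <pthread.h>
#include <stdio.h>

#define	QUIESCE		0x1	/* stand-in for SRS_QUIESCE */
#define	QUIESCE_DONE	0x2	/* stand-in for SRS_QUIESCE_DONE */
#define	RESTART		0x4	/* stand-in for SRS_RESTART */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t async = PTHREAD_COND_INITIALIZER;	  /* worker cv */
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER; /* control cv */
static unsigned state;

static void *
worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (!(state & (QUIESCE | RESTART)))
			pthread_cond_wait(&async, &lock);
		if (state & RESTART)
			break;
		if ((state & QUIESCE) && !(state & QUIESCE_DONE)) {
			/* stop processing and acknowledge the request */
			state |= QUIESCE_DONE;
			pthread_cond_signal(&done_cv);
		}
		/* stay parked until the controller moves us again */
		pthread_cond_wait(&async, &lock);
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	state |= QUIESCE;			/* ask the worker to pause */
	pthread_cond_signal(&async);
	while (!(state & QUIESCE_DONE))		/* wait for the ack */
		pthread_cond_wait(&done_cv, &lock);
	printf("worker quiesced\n");
	state = RESTART;			/* let it run (and exit) */
	pthread_cond_signal(&async);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return (0);
}
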
+/*
+ * soft ring set (SRS) Tx modes
+ */
+typedef enum {
+	SRS_TX_DEFAULT = 0,
+	SRS_TX_SERIALIZE,
+	SRS_TX_FANOUT,
+	SRS_TX_BW,
+	SRS_TX_BW_FANOUT
+} mac_tx_srs_mode_t;
+
+/*
+ * SRS fanout states
+ */
+typedef enum {
+	SRS_FANOUT_UNINIT = 0,
+	SRS_FANOUT_INIT,
+	SRS_FANOUT_REINIT
+} mac_srs_fanout_state_t;
+
+/*
+ * Structure for dls statistics
+ */
+struct dls_kstats {
+	kstat_named_t	dlss_soft_ring_pkt_drop;
+};
+
+extern struct dls_kstats dls_kstat;
+
+#define	DLS_BUMP_STAT(x, y)	(dls_kstat.x.value.ui32 += y)
+
+/* Turn dynamic polling off */
+#define	MAC_SRS_POLLING_OFF(mac_srs) {					\
+	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
+	if (((mac_srs)->srs_state & (SRS_POLLING_CAPAB|SRS_POLLING)) ==	\
+	    (SRS_POLLING_CAPAB|SRS_POLLING)) {				\
+		(mac_srs)->srs_state &= ~SRS_POLLING;			\
+		(void) mac_hwring_enable_intr((mac_ring_handle_t)	\
+		    (mac_srs)->srs_ring);				\
+		(mac_srs)->srs_rx.sr_poll_off++;			\
+	}								\
+}
+
+#define	MAC_COUNT_CHAIN(mac_srs, head, tail, cnt, sz) {			\
+	mblk_t *tmp;							\
+	boolean_t bw_ctl = B_FALSE;					\
+									\
+	ASSERT((head) != NULL);						\
+	cnt = 0;							\
+	sz = 0;								\
+	if ((mac_srs)->srs_type & SRST_BW_CONTROL)			\
+		bw_ctl = B_TRUE;					\
+	tmp = tail = (head);						\
+	if ((head)->b_next == NULL) {					\
+		cnt = 1;						\
+		if (bw_ctl)						\
+			sz += msgdsize(head);				\
+	} else {							\
+		while (tmp != NULL) {					\
+			tail = tmp;					\
+			cnt++;						\
+			if (bw_ctl)					\
+				sz += msgdsize(tmp);			\
+			tmp = tmp->b_next;				\
+		}							\
+	}								\
+}
+
+/*
+ * Decrement the cumulative packet count in the SRS and its
+ * soft rings. If the srs_poll_pkt_cnt goes below lowat, then check
+ * if the interface was left in a polling mode and no one
+ * is really processing the queue (to get the interface out
+ * of poll mode). If no one is processing the queue, then
+ * acquire the PROC and signal the poll thread to check the
+ * interface for packets and get the interface back to interrupt
+ * mode if nothing is found.
+ */
+#define	MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt) {			\
+	mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx;			\
+	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
+									\
+	srs_rx->sr_poll_pkt_cnt -= cnt;					\
+	if ((srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_poll_thres) &&	\
+	    (((mac_srs)->srs_state &					\
+	    (SRS_POLLING|SRS_PROC|SRS_GET_PKTS)) == SRS_POLLING))	\
+	{								\
+		(mac_srs)->srs_state |= (SRS_PROC|SRS_GET_PKTS);	\
+		cv_signal(&(mac_srs)->srs_cv);				\
+		srs_rx->sr_below_hiwat++;				\
+	}								\
+}
+
+/*
+ * The following two macros are used to update the inbound packet and byte
+ * counts. The packet and byte counts reflect the packets and bytes that are
+ * taken out of the SRS's queue, i.e. indicating that they are being delivered.
+ * The srs_count and srs_size are updated in different locations, as the
+ * srs_size is also used to take into account any bandwidth limits. The
+ * srs_size is updated only when a soft ring, if any, sends a packet up,
+ * as opposed to updating it when the SRS sends a packet to the SR, i.e.
+ * the srs_size reflects the packets in the SRS and SRs. These
+ * macros decrement the srs_size and srs_count and also increment the
+ * ipackets and ibytes stats, respectively.
+ *
+ * xxx-venu These are done under srs_lock, for now we still update
+ * mci_stat_ibytes/mci_stat_ipackets atomically, need to check if
+ * just updating them would be accurate enough.
+ *
+ * If we are updating these for a sub-flow SRS, then we need to also
+ * update its MAC client bandwidth info, if the MAC client is also
+ * bandwidth regulated.
+ */
+#define	MAC_UPDATE_SRS_SIZE_LOCKED(srs, sz) {				\
+	if ((srs)->srs_type & SRST_BW_CONTROL) {			\
+		mutex_enter(&(srs)->srs_bw->mac_bw_lock);		\
+		(srs)->srs_bw->mac_bw_sz -= (sz);			\
+		(srs)->srs_bw->mac_bw_used += (sz);			\
+		mutex_exit(&(srs)->srs_bw->mac_bw_lock);		\
+	}								\
+}
+
+#define	MAC_TX_UPDATE_BW_INFO(srs, sz) {				\
+	(srs)->srs_bw->mac_bw_sz -= (sz);				\
+	(srs)->srs_bw->mac_bw_used += (sz);				\
+}
+
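
MAC_COUNT_CHAIN above makes one pass over an mblk chain to find its tail and count its messages, and totals the byte size only when the SRS is bandwidth controlled, so the msgdsize() cost is not paid on the fast path. The same logic as a plain function over a toy two-field mblk, runnable in user space (the macro's single-mblk special case is folded into the general loop here):

#include <stdio.h>
#include <stddef.h>

/* Toy mblk: just the chain link and a stand-in for msgdsize(). */
typedef struct mblk {
	struct mblk	*b_next;
	size_t		b_size;
} mblk_t;

static mblk_t *
count_chain(mblk_t *head, int bw_ctl, int *cntp, size_t *szp)
{
	mblk_t *mp, *tail = head;
	int cnt = 0;
	size_t sz = 0;

	for (mp = head; mp != NULL; mp = mp->b_next) {
		tail = mp;
		cnt++;
		if (bw_ctl)		/* only pay the size cost under B/W ctl */
			sz += mp->b_size;
	}
	*cntp = cnt;
	*szp = sz;
	return (tail);
}

int
main(void)
{
	mblk_t c = { NULL, 60 }, b = { &c, 1514 }, a = { &b, 40 };
	int cnt;
	size_t sz;

	(void) count_chain(&a, 1, &cnt, &sz);
	printf("%d packets, %zu bytes\n", cnt, sz);	/* 3, 1614 */
	return (0);
}
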
+#define	TX_MULTI_RING_MODE(mac_srs)				\
+	((mac_srs)->srs_tx.st_mode == SRS_TX_FANOUT ||		\
+	(mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)
+
+/* Soft ring flags for teardown */
+#define	SRS_POLL_THR_OWNER	(SRS_PROC | SRS_POLLING | SRS_GET_PKTS)
+#define	SRS_PAUSE		(SRS_CONDEMNED | SRS_QUIESCE)
+#define	S_RING_PAUSE		(S_RING_CONDEMNED | S_RING_QUIESCE)
+
+/* Soft rings */
+extern void mac_soft_ring_init(void);
+extern void mac_soft_ring_finish(void);
+extern void mac_fanout_setup(mac_client_impl_t *, flow_entry_t *,
+    mac_resource_props_t *, mac_direct_rx_t, void *, mac_resource_handle_t);
+
+extern void mac_soft_ring_worker_wakeup(mac_soft_ring_t *);
+extern void mac_soft_ring_blank(void *, time_t, uint_t, int);
+extern mblk_t *mac_soft_ring_poll(mac_soft_ring_t *, int);
+extern void mac_soft_ring_destroy(mac_soft_ring_t *);
+extern void mac_soft_ring_dls_bypass(void *, mac_direct_rx_t, void *);
+
+/* Rx SRS */
+extern mac_soft_ring_set_t *mac_srs_create(struct mac_client_impl_s *,
+    flow_entry_t *, uint32_t, mac_direct_rx_t, void *, mac_resource_handle_t,
+    mac_ring_t *);
+extern void mac_srs_free(mac_soft_ring_set_t *);
+extern void mac_srs_signal(mac_soft_ring_set_t *, uint_t);
+extern cpu_t *mac_srs_bind(mac_soft_ring_set_t *, processorid_t);
+
+extern void mac_srs_change_upcall(void *, mac_direct_rx_t, void *);
+extern void mac_srs_quiesce_initiate(mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_enable(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_disable(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_quiesce(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_restart(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_rx_srs_quiesce(mac_soft_ring_set_t *, uint_t);
+extern void mac_rx_srs_restart(mac_soft_ring_set_t *);
+extern void mac_rx_srs_subflow_process(void *, mac_resource_handle_t, mblk_t *,
+    boolean_t);
+extern void mac_tx_srs_quiesce(mac_soft_ring_set_t *, uint_t);
+
+/* Tx SRS, Tx softring */
+extern void mac_tx_srs_wakeup(mac_soft_ring_set_t *, mac_ring_handle_t);
+extern void mac_tx_srs_setup(struct mac_client_impl_s *,
+    flow_entry_t *, uint32_t);
+extern mac_tx_func_t mac_tx_get_func(uint32_t);
+extern mblk_t *mac_tx_send(mac_client_handle_t, mac_ring_handle_t, mblk_t *,
+    mac_tx_stats_t *);
+extern boolean_t mac_tx_srs_ring_present(mac_soft_ring_set_t *, mac_ring_t *);
+extern void mac_tx_srs_add_ring(mac_soft_ring_set_t *, mac_ring_t *);
+extern void mac_tx_srs_del_ring(mac_soft_ring_set_t *, mac_ring_t *);
+extern mac_tx_cookie_t mac_tx_srs_no_desc(mac_soft_ring_set_t *, mblk_t *,
+    uint16_t, mblk_t **);
+
+/* Subflow specific stuff */
+extern int mac_srs_flow_create(struct mac_client_impl_s *, flow_entry_t *,
+    mac_resource_props_t *, int, int, mac_direct_rx_t);
+extern void mac_srs_update_bwlimit(flow_entry_t *, mac_resource_props_t *);
+extern void mac_srs_adjust_subflow_bwlimit(struct mac_client_impl_s *);
+extern void mac_srs_update_drv(struct mac_client_impl_s *);
+extern void mac_update_srs_priority(mac_soft_ring_set_t *,
pri_t); +extern void mac_client_update_classifier(mac_client_impl_t *, boolean_t); + +extern void mac_soft_ring_intr_enable(void *); +extern void mac_soft_ring_intr_disable(void *); +extern mac_soft_ring_t *mac_soft_ring_create(int, clock_t, void *, uint16_t, + pri_t, mac_client_impl_t *, mac_soft_ring_set_t *, + processorid_t, mac_direct_rx_t, void *, mac_resource_handle_t); +extern cpu_t *mac_soft_ring_bind(mac_soft_ring_t *, processorid_t); + extern void mac_soft_ring_unbind(mac_soft_ring_t *); +extern void mac_soft_ring_free(mac_soft_ring_t *, boolean_t); +extern void mac_soft_ring_signal(mac_soft_ring_t *, uint_t); +extern void mac_rx_soft_ring_process(mac_client_impl_t *, mac_soft_ring_t *, + mblk_t *, mblk_t *, int, size_t); +extern mac_tx_cookie_t mac_tx_soft_ring_process(mac_soft_ring_t *, + mblk_t *, uint16_t, mblk_t **); +extern void mac_srs_worker_quiesce(mac_soft_ring_set_t *); +extern void mac_srs_worker_restart(mac_soft_ring_set_t *); +extern void mac_rx_attach_flow_srs(mac_impl_t *, flow_entry_t *, + mac_soft_ring_set_t *, mac_ring_t *, mac_classify_type_t); + +extern void mac_rx_srs_drain_bw(mac_soft_ring_set_t *, uint_t); +extern void mac_rx_srs_drain(mac_soft_ring_set_t *, uint_t); +extern void mac_rx_srs_process(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +extern void mac_srs_worker(mac_soft_ring_set_t *); +extern void mac_rx_srs_poll_ring(mac_soft_ring_set_t *); +extern void mac_tx_srs_drain(mac_soft_ring_set_t *, uint_t); + +extern void mac_tx_srs_restart(mac_soft_ring_set_t *); +extern void mac_rx_srs_remove(mac_soft_ring_set_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_SOFT_RING_H */ diff --git a/usr/src/uts/common/sys/modhash.h b/usr/src/uts/common/sys/modhash.h index 5860ad165a..68d1c4dedd 100644 --- a/usr/src/uts/common/sys/modhash.h +++ b/usr/src/uts/common/sys/modhash.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_MODHASH_H #define _SYS_MODHASH_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Generic hash implementation for the kernel. 
*/ @@ -129,6 +126,8 @@ int mod_hash_destroy(mod_hash_t *, mod_hash_key_t); int mod_hash_find(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); int mod_hash_find_cb(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *, void (*)(mod_hash_key_t, mod_hash_val_t)); +int mod_hash_find_cb_rval(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *, + int (*)(mod_hash_key_t, mod_hash_val_t), int *); void mod_hash_walk(mod_hash_t *, uint_t (*)(mod_hash_key_t, mod_hash_val_t *, void *), void *); diff --git a/usr/src/uts/common/sys/nxge/nxge.h b/usr/src/uts/common/sys/nxge/nxge.h index 37cd6db405..624e433572 100644 --- a/usr/src/uts/common/sys/nxge/nxge.h +++ b/usr/src/uts/common/sys/nxge/nxge.h @@ -319,6 +319,7 @@ typedef struct _filter_t { uint32_t all_sap_cnt; } filter_t, *p_filter_t; + typedef struct _nxge_port_stats_t { /* * Overall structure size @@ -470,6 +471,8 @@ typedef struct _nxge_stats_t { } nxge_stats_t, *p_nxge_stats_t; + + typedef struct _nxge_intr_t { boolean_t intr_registered; /* interrupts are registered */ boolean_t intr_enabled; /* interrupts are enabled */ @@ -497,7 +500,7 @@ typedef struct _nxge_ldgv_t { p_nxge_ldg_t ldgp; p_nxge_ldv_t ldvp; p_nxge_ldv_t ldvp_syserr; - int ldvp_syserr_allocated; + boolean_t ldvp_syserr_alloced; } nxge_ldgv_t, *p_nxge_ldgv_t; typedef enum { @@ -542,7 +545,8 @@ typedef struct { #define NXGE_DC_SET(map, channel) map |= (1 << channel) #define NXGE_DC_RESET(map, channel) map &= (~(1 << channel)) -#define NXGE_LOGICAL_GROUP_MAX NXGE_MAX_TDCS +/* For now, we only support up to 8 RDC/TDC groups */ +#define NXGE_LOGICAL_GROUP_MAX NXGE_MAX_RDC_GROUPS typedef struct { int sequence; /* To order groups in time. */ @@ -558,6 +562,12 @@ typedef struct { } nxge_grp_set_t; /* + * Transmit Ring Group + * TX groups will be used exclusively for the purpose of Hybrid I/O. From + * the point of view of the nxge driver, the groups will be software + * constructs which will be used to establish the relationship between TX + * rings and shares. + * * Receive Ring Group * One of the advanced virtualization features is the ability to bundle * multiple Receive Rings in a single group. One or more MAC addresses may @@ -567,12 +577,16 @@ typedef struct { * RX ring groups can come with a predefined set of member rings, or they * are programmable by adding and removing rings to/from them. 
*/ -typedef struct _nxge_rx_ring_group_t { +typedef struct _nxge_ring_group_t { mac_group_handle_t ghandle; p_nxge_t nxgep; + boolean_t started; + mac_ring_type_t type; int gindex; int sindex; -} nxge_rx_ring_group_t; + int rdctbl; + int n_mac_addrs; +} nxge_ring_group_t; /* * Ring Handle @@ -581,7 +595,7 @@ typedef struct _nxge_ring_handle_t { p_nxge_t nxgep; int index; /* port-wise */ mac_ring_handle_t ring_handle; -} nxge_ring_handle_t; +} nxge_ring_handle_t, *p_nxge_ring_handle_t; /* * Share Handle @@ -613,9 +627,6 @@ struct _nxge_t { uint64_t nxge_debug_level; /* driver state bit flags */ kmutex_t genlock[1]; enum nxge_mac_state nxge_mac_state; - ddi_softintr_t resched_id; /* reschedule callback */ - boolean_t resched_needed; - boolean_t resched_running; p_dev_regs_t dev_regs; npi_handle_t npi_handle; @@ -695,17 +706,12 @@ struct _nxge_t { p_rx_rcr_rings_t rx_rcr_rings; p_rx_mbox_areas_t rx_mbox_areas_p; - uint32_t start_rdc; - uint32_t max_rdcs; uint32_t rdc_mask; /* Transmit descriptors rings */ p_tx_rings_t tx_rings; p_tx_mbox_areas_t tx_mbox_areas_p; - uint32_t start_tdc; - uint32_t max_tdcs; - ddi_dma_handle_t dmasparehandle; ulong_t sys_page_sz; @@ -777,7 +783,15 @@ struct _nxge_t { nxge_grp_set_t tx_set; boolean_t tdc_is_shared[NXGE_MAX_TDCS]; - nxge_rx_ring_group_t rx_hio_groups[NXGE_MAX_RDC_GROUPS]; + boolean_t rx_channel_started[NXGE_MAX_RDCS]; + + /* Ring Handles */ + nxge_ring_handle_t tx_ring_handles[NXGE_MAX_TDCS]; + nxge_ring_handle_t rx_ring_handles[NXGE_MAX_RDCS]; + + nxge_ring_group_t tx_hio_groups[NXGE_MAX_TDC_GROUPS]; + nxge_ring_group_t rx_hio_groups[NXGE_MAX_RDC_GROUPS]; + nxge_share_handle_t shares[NXGE_MAX_VRS]; }; diff --git a/usr/src/uts/common/sys/nxge/nxge_common.h b/usr/src/uts/common/sys/nxge/nxge_common.h index f2bbc8e064..7956b5f653 100644 --- a/usr/src/uts/common/sys/nxge/nxge_common.h +++ b/usr/src/uts/common/sys/nxge/nxge_common.h @@ -277,15 +277,24 @@ typedef struct nxge_tdc_cfg { #define RDC_TABLE_ENTRY_METHOD_SEQ 0 #define RDC_TABLE_ENTRY_METHOD_REP 1 +/* per transmit DMA channel table group data structure */ +typedef struct nxge_tdc_grp { + uint32_t start_tdc; /* assume assigned in sequence */ + uint8_t max_tdcs; + dc_map_t map; + uint8_t grp_index; /* nxge_t.tx_set.group[grp_index] */ +} nxge_tdc_grp_t, *p_nxge_tdc_grp_t; + /* per receive DMA channel table group data structure */ typedef struct nxge_rdc_grp { - uint32_t flag; /* 0: not configured 1: configured */ + boolean_t flag; /* 0: not configured 1: configured */ uint8_t port; - uint8_t start_rdc; /* assume assigned in sequence */ + uint32_t start_rdc; /* assume assigned in sequence */ uint8_t max_rdcs; uint8_t def_rdc; dc_map_t map; uint16_t config_method; + uint8_t grp_index; /* nxge_t.rx_set.group[grp_index] */ } nxge_rdc_grp_t, *p_nxge_rdc_grp_t; #define RDC_MAP_IN(map, rdc) \ @@ -383,7 +392,6 @@ typedef struct nxge_hw_pt_cfg { uint32_t ser_ldvid; uint32_t def_rdc; /* default RDC */ uint32_t drr_wt; /* port DRR weight */ - uint32_t start_grpid; /* starting group ID */ uint32_t max_grpids; /* max group ID */ uint32_t grpids[NXGE_MAX_RDCS]; /* RDC group IDs */ uint32_t max_rdc_grpids; /* max RDC group ID */ @@ -393,6 +401,7 @@ typedef struct nxge_hw_pt_cfg { uint32_t start_mac_entry; /* where to put the first mac */ uint32_t max_macs; /* the max mac entry allowed */ uint32_t mac_pref; /* preference over VLAN */ + uint32_t def_mac_txdma_grpid; /* default TDC group ID */ uint32_t def_mac_rxdma_grpid; /* default RDC group ID */ uint32_t vlan_pref; /* preference over MAC */ @@ -417,6 +426,9 
@@ typedef struct nxge_dma_pt_cfg { */ uint32_t tx_dma_map; /* Transmit DMA channel bit map */ + /* Transmit DMA channel: device wise */ + nxge_tdc_grp_t tdc_grps[NXGE_MAX_TDC_GROUPS]; + /* Receive DMA channel */ nxge_rdc_grp_t rdc_grps[NXGE_MAX_RDC_GROUPS]; diff --git a/usr/src/uts/common/sys/nxge/nxge_defs.h b/usr/src/uts/common/sys/nxge/nxge_defs.h index db061381da..8f8e226b32 100644 --- a/usr/src/uts/common/sys/nxge/nxge_defs.h +++ b/usr/src/uts/common/sys/nxge/nxge_defs.h @@ -278,6 +278,12 @@ extern "C" { */ #define NXGE_MAX_VRS 8 +/* + * TDC groups are used exclusively for the purpose of Hybrid I/O + * TX needs one group for each VR + */ +#define NXGE_MAX_TDC_GROUPS (NXGE_MAX_VRS) + /* Max. RDC table groups */ #define NXGE_MAX_RDC_GROUPS 8 #define NXGE_MAX_RDCS 16 diff --git a/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h b/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h index fc99701ca3..d7270a6fb1 100644 --- a/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h +++ b/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -33,6 +32,7 @@ extern "C" { #include <nxge_defs.h> + /* FZC_FFLP Offsets */ #define FFLP_ENET_VLAN_TBL_REG (FZC_FFLP + 0x00000) @@ -1284,6 +1284,7 @@ typedef struct tcam_entry { * before this header file. * Need to move these includes to impl files ... */ + #include <netinet/in.h> typedef union flow_template { diff --git a/usr/src/uts/common/sys/nxge/nxge_flow.h b/usr/src/uts/common/sys/nxge/nxge_flow.h index 352834d796..c76f2731a1 100644 --- a/usr/src/uts/common/sys/nxge/nxge_flow.h +++ b/usr/src/uts/common/sys/nxge/nxge_flow.h @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. diff --git a/usr/src/uts/common/sys/nxge/nxge_hio.h b/usr/src/uts/common/sys/nxge/nxge_hio.h index 2a25341111..10487202b6 100644 --- a/usr/src/uts/common/sys/nxge/nxge_hio.h +++ b/usr/src/uts/common/sys/nxge/nxge_hio.h @@ -34,7 +34,7 @@ extern "C" { #include <nxge_mac.h> #include <nxge_ipp.h> #include <nxge_fflp.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #if defined(sun4v) #include <sys/vnet_res.h> #endif @@ -249,9 +249,10 @@ typedef struct nxge_hio_vr { size_t size; vr_region_t region; /* 1 of 8 regions. */ - uint8_t rdc_tbl; /* 1 of 8 RDC tables. */ + int rdc_tbl; /* 1 of 8 RDC tables. */ + int tdc_tbl; /* 1 of 8 TDC tables. */ ether_addr_t altmac; /* The alternate MAC address. */ - mac_addr_slot_t slot; /* According to nxge_m_mmac_add(). */ + int slot; /* According to nxge_m_mmac_add(). */ #if defined(sun4v) vio_net_handle_t vhp; /* The handle given to us by the vnet. 
*/ @@ -369,12 +370,18 @@ extern const char *nxge_ddi_perror(int); */ extern void nxge_hio_group_get(void *arg, mac_ring_type_t type, int group, mac_group_info_t *infop, mac_group_handle_t ghdl); -extern int nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie, - mac_share_handle_t *shandle); +extern int nxge_hio_share_alloc(void *arg, mac_share_handle_t *shandle); extern void nxge_hio_share_free(mac_share_handle_t shandle); extern void nxge_hio_share_query(mac_share_handle_t shandle, - mac_ring_type_t type, uint32_t *rmin, uint32_t *rmax, uint64_t *rmap, - uint64_t *gnum); + mac_ring_type_t type, mac_ring_handle_t *rings, uint_t *n_rings); +extern int nxge_hio_share_add_group(mac_share_handle_t, + mac_group_driver_t); +extern int nxge_hio_share_rem_group(mac_share_handle_t, + mac_group_driver_t); +extern int nxge_hio_share_bind(mac_share_handle_t, uint64_t cookie, + uint64_t *rcookie); +extern void nxge_hio_share_unbind(mac_share_handle_t); + /* nxge_hio_guest.c */ extern void nxge_hio_unregister(nxge_t *); @@ -416,12 +423,6 @@ extern int nxge_hio_hostinfo_get_rdc_table(p_nxge_t); extern int nxge_hio_hostinfo_init(nxge_t *, nxge_hio_vr_t *, ether_addr_t *); extern void nxge_hio_hostinfo_uninit(nxge_t *, nxge_hio_vr_t *); - /* nxge_rxdma.c */ -extern nxge_status_t nxge_rx_poll(nxge_t *, int); - - /* nxge_txdma.c */ -extern uint_t nxge_tx_poll(nxge_t *, int); - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/nxge/nxge_impl.h b/usr/src/uts/common/sys/nxge/nxge_impl.h index 5420ac00bb..63779b4e88 100644 --- a/usr/src/uts/common/sys/nxge/nxge_impl.h +++ b/usr/src/uts/common/sys/nxge/nxge_impl.h @@ -36,6 +36,8 @@ extern "C" { #define NIU_MAJOR_VER 1 #define NIU_MINOR_VER 1 +#if defined(sun4v) + /* * NIU HV API v1.0 definitions */ @@ -44,6 +46,8 @@ extern "C" { #define N2NIU_TX_LP_CONF 0x144 #define N2NIU_TX_LP_INFO 0x145 +#endif /* defined(sun4v) */ + #ifndef _ASM #include <sys/types.h> @@ -81,8 +85,7 @@ extern "C" { #include <sys/netlb.h> #include <sys/ddi_intr.h> -#include <sys/mac.h> -#include <sys/mac_impl.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #if defined(sun4v) @@ -611,7 +614,6 @@ struct _nxge_ldg_t { uint8_t ldg; /* logical group number */ uint8_t vldg_index; boolean_t arm; - boolean_t interrupted; uint16_t ldg_timer; /* counter */ uint8_t func; uint8_t vector; @@ -749,6 +751,13 @@ typedef struct _nxge_mmac_stats_t { struct ether_addr mmac_avail_pool[16]; } nxge_mmac_stats_t, *p_nxge_mmac_stats_t; +/* + * Copied from mac.h. Should be cleaned up by driver. + */ +#define MMAC_SLOT_USED 0x1 /* address slot used */ +#define MMAC_VENDOR_ADDR 0x2 /* address returned is vendor supplied */ + + #define NXGE_MAX_MMAC_ADDRS 32 #define NXGE_NUM_MMAC_ADDRS 8 #define NXGE_NUM_OF_PORTS_QUAD 4 @@ -885,6 +894,8 @@ void nxge_hw_set_mac_modes(p_nxge_t); /* nxge_send.c. 
*/ uint_t nxge_reschedule(caddr_t); +mblk_t *nxge_tx_ring_send(void *, mblk_t *); +int nxge_start(p_nxge_t, p_tx_ring_t, p_mblk_t); /* nxge_rxdma.c */ nxge_status_t nxge_rxdma_cfg_rdcgrp_default_rdc(p_nxge_t, @@ -1050,6 +1061,8 @@ int nxge_get_nports(p_nxge_t); void nxge_free_buf(buf_alloc_type_t, uint64_t, uint32_t); +#if defined(sun4v) + uint64_t hv_niu_rx_logical_page_conf(uint64_t, uint64_t, uint64_t, uint64_t); #pragma weak hv_niu_rx_logical_page_conf @@ -1131,6 +1144,8 @@ uint64_t hv_niu_vrtx_to_logical_dev(uint32_t cookie, uint64_t v_chidx, uint64_t *ldn); #pragma weak hv_niu_vrtx_to_logical_dev +#endif /* defined(sun4v) */ + #ifdef NXGE_DEBUG char *nxge_dump_packet(char *, int); #endif diff --git a/usr/src/uts/common/sys/nxge/nxge_rxdma.h b/usr/src/uts/common/sys/nxge/nxge_rxdma.h index 43a7185148..a336dbb9cb 100644 --- a/usr/src/uts/common/sys/nxge/nxge_rxdma.h +++ b/usr/src/uts/common/sys/nxge/nxge_rxdma.h @@ -155,6 +155,13 @@ typedef struct _nxge_rdc_sys_stats { uint32_t zcp_eop_err; } nxge_rdc_sys_stats_t, *p_nxge_rdc_sys_stats_t; +/* + * Software reserved buffer offset + */ +typedef struct _nxge_rxbuf_off_hdr_t { + uint32_t index; +} nxge_rxbuf_off_hdr_t, *p_nxge_rxbuf_off_hdr_t; + typedef struct _rx_msg_t { nxge_os_dma_common_t buf_dma; @@ -231,8 +238,11 @@ typedef struct _rx_rcr_ring_t { uint32_t intr_timeout; uint32_t intr_threshold; uint64_t max_receive_pkts; - mac_resource_handle_t rcr_mac_handle; + mac_ring_handle_t rcr_mac_handle; + uint64_t rcr_gen_num; uint32_t rcvd_pkt_bytes; /* Received bytes of a packet */ + p_nxge_ldv_t ldvp; + p_nxge_ldg_t ldgp; } rx_rcr_ring_t, *p_rx_rcr_ring_t; @@ -359,11 +369,13 @@ typedef struct _rx_mbox_t { typedef struct _rx_rbr_rings_t { p_rx_rbr_ring_t *rbr_rings; uint32_t ndmas; + boolean_t rxbuf_allocated; } rx_rbr_rings_t, *p_rx_rbr_rings_t; typedef struct _rx_rcr_rings_t { p_rx_rcr_ring_t *rcr_rings; uint32_t ndmas; + boolean_t cntl_buf_allocated; } rx_rcr_rings_t, *p_rx_rcr_rings_t; typedef struct _rx_mbox_areas_t { @@ -414,6 +426,10 @@ void nxge_rxdma_fix_channel(p_nxge_t, uint16_t); void nxge_rxdma_fixup_channel(p_nxge_t, uint16_t, int); int nxge_rxdma_get_ring_index(p_nxge_t, uint16_t); +mblk_t *nxge_rx_poll(void *, int); +int nxge_enable_poll(void *); +int nxge_disable_poll(void *); + void nxge_rxdma_regs_dump_channels(p_nxge_t); nxge_status_t nxge_rxdma_handle_sys_errors(p_nxge_t); void nxge_rxdma_inject_err(p_nxge_t, uint32_t, uint8_t); @@ -422,6 +438,8 @@ extern nxge_status_t nxge_alloc_rx_mem_pool(p_nxge_t); extern nxge_status_t nxge_alloc_rxb(p_nxge_t nxgep, int channel); extern void nxge_free_rxb(p_nxge_t nxgep, int channel); +int nxge_get_rxring_index(p_nxge_t, int, int); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/nxge/nxge_serialize.h b/usr/src/uts/common/sys/nxge/nxge_serialize.h deleted file mode 100644 index f235de7b2e..0000000000 --- a/usr/src/uts/common/sys/nxge/nxge_serialize.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_NXGE_NXGE_SERIALIZE_H -#define _SYS_NXGE_NXGE_SERIALIZE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define NXGE_TX_AVG_CNT 200000000 -#define NXGE_TX_AVG_RES 2000 /* sleep at least a tick */ -#define MAXHRS 3 /* # of packets to process */ -#define ONESEC 1000000000 /* one second */ - -#include <sys/stream.h> -#include <sys/mutex.h> -#include <sys/condvar.h> -#include <sys/kmem.h> -#include <sys/ddi.h> -#include <sys/callb.h> - -/* - * Thread state flags - */ -#define NXGE_TX_STHREAD_RUNNING 0x0001 /* thread started */ -#define NXGE_TX_STHREAD_DESTROY 0x0002 /* thread is being destroyed */ -#define NXGE_TX_STHREAD_EXIT 0x0003 /* thread exits */ - -typedef int (onetrack_t)(mblk_t *, void *); - -typedef struct { - kmutex_t lock; - int count; - mblk_t *head; - mblk_t *tail; - void *cookie; - onetrack_t *serialop; - int owned; - /* Counter tracks the total time spent in serializer function */ - hrtime_t totaltime; - /* - * Counter tracks the total number of time the serializer - * function was called. - */ - long totalcount; - /* - * Counter maintains the average time spent in the serializer function - * and is derived as (totaltime/totalcount). - */ - int avg; - /* - * The lenght of the queue to which the serializer function - * will append data. - */ - int length; - kcondvar_t serial_cv; - kcondvar_t timecv; - kmutex_t serial; - uint32_t s_state; - boolean_t s_need_signal; - callb_cpr_t s_cprinfo; - kthread_t *tx_sthread; - kmutex_t timelock; -} nxge_serialize_t; - -/* - * Prototypes definitions - */ -nxge_serialize_t *nxge_serialize_create(int, onetrack_t *, void *); -void nxge_serialize_destroy(nxge_serialize_t *); -void nxge_serialize_enter(nxge_serialize_t *, mblk_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_NXGE_NXGE_SERIALIZE_H */ diff --git a/usr/src/uts/common/sys/nxge/nxge_txdma.h b/usr/src/uts/common/sys/nxge/nxge_txdma.h index 859f6a124e..829d67ebce 100644 --- a/usr/src/uts/common/sys/nxge/nxge_txdma.h +++ b/usr/src/uts/common/sys/nxge/nxge_txdma.h @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
@@ -31,9 +30,9 @@ extern "C" { #endif +#include <sys/taskq.h> #include <sys/nxge/nxge_txdma_hw.h> #include <npi_txdma.h> -#include <sys/nxge/nxge_serialize.h> #define TXDMA_PORT_BITMAP(nxgep) (nxgep->pt_config.tx_dma_map) @@ -152,14 +151,13 @@ typedef struct _tx_ring_t { uint32_t tx_ring_offline; boolean_t tx_ring_busy; - p_tx_msg_t tx_free_list_p; - nxge_os_mutex_t freelock; - nxge_os_mutex_t lock; + mac_ring_handle_t tx_ring_handle; + ddi_taskq_t *taskq; uint16_t index; uint16_t tdc; struct nxge_tdc_cfg *tdc_p; - uint_t tx_ring_size; + int tx_ring_size; uint32_t num_chunks; uint_t tx_wrap_mask; @@ -170,11 +168,10 @@ typedef struct _tx_ring_t { tx_ring_kick_t ring_kick_tail; txdma_mailbox_t tx_mbox; - uint_t descs_pending; + int descs_pending; boolean_t queueing; nxge_os_mutex_t sq_lock; - nxge_serialize_t *serial; p_mblk_t head; p_mblk_t tail; diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index 2591642dc0..8d93c7780e 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -161,6 +161,7 @@ void secpolicy_fs_mount_clearopts(cred_t *, struct vfs *); int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); +int secpolicy_dld_ioctl(const cred_t *, const char *, const char *); int secpolicy_xvm_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); diff --git a/usr/src/uts/common/sys/softmac_impl.h b/usr/src/uts/common/sys/softmac_impl.h index 3fcfc97415..5f9d1401a7 100644 --- a/usr/src/uts/common/sys/softmac_impl.h +++ b/usr/src/uts/common/sys/softmac_impl.h @@ -26,8 +26,6 @@ #ifndef _SYS_SOFTMAC_IMPL_H #define _SYS_SOFTMAC_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/ethernet.h> #include <sys/taskq.h> @@ -37,6 +35,9 @@ #include <sys/stream.h> #include <sys/dlpi.h> #include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/mac_ether.h> #ifdef __cplusplus @@ -68,14 +69,20 @@ typedef struct softmac_lower_s { boolean_t sl_pending_ioctl; mblk_t *sl_ack_mp; - mac_resource_handle_t sl_handle; ldi_handle_t sl_lh; } softmac_lower_t; -enum softmac_state { +typedef enum { SOFTMAC_INITIALIZED, SOFTMAC_READY -}; +} softmac_lower_state_t; + +typedef enum { + SOFTMAC_UNINIT, + SOFTMAC_ATTACH_INPROG, + SOFTMAC_ATTACH_DONE, + SOFTMAC_DETACH_INPROG, +} softmac_state_t; typedef struct softmac_dev_s { dev_t sd_dev; @@ -86,8 +93,12 @@ typedef struct softmac_dev_s { */ #define SOFTMAC_GLDV3 0x01 #define SOFTMAC_NOSUPP 0x02 -#define SOFTMAC_ATTACH_DONE 0x04 -#define SOFTMAC_NEED_RECREATE 0x08 +#define SOFTMAC_NEED_RECREATE 0x04 +#define SOFTMAC_NOTIFY_QUIT 0x08 + +#define SMAC_NONZERO_NODECNT(softmac) \ + ((softmac->smac_softmac[0] != NULL) + \ + (softmac->smac_softmac[1] != NULL)) /* * The softmac structure allows all minor nodes (at most two, style-1 and @@ -111,18 +122,14 @@ typedef struct softmac { uint32_t smac_cnt; /* # of minor nodes for this device */ /* - * The following fields are protected by softmac_hash_lock. - */ - /* + * The following fields are protected by smac_mutex. + * * The smac_hold_cnt field increases when softmac_hold_device() is * called to force the dls_vlan_t of the device to be created. The * device pre-detach fails if this counter is not 0. */ + softmac_state_t smac_state; uint32_t smac_hold_cnt; - - /* - * The following fields are protected by smac_lock. 
-	 */
 	kmutex_t	smac_mutex;
 	kcondvar_t	smac_cv;
 	uint32_t	smac_flags;
@@ -145,6 +152,16 @@ typedef struct softmac {
 	uint32_t	smac_attached_left;
 
 	/*
+	 * Thread that handles the DL_NOTIFY_IND messages from the lower stream.
+	 */
+	kthread_t	*smac_notify_thread;
+	/*
+	 * Head and tail of the DL_NOTIFY_IND messages.
+	 */
+	mblk_t		*smac_notify_head;
+	mblk_t		*smac_notify_tail;
+
+	/*
 	 * The remaining fields are used to register the MAC for a legacy
 	 * device. They are set in softmac_mac_register() and do not change.
 	 * One can access them when mac_register() is done without locks.
@@ -177,11 +194,8 @@ typedef struct softmac {
 	dl_capab_mdt_t	smac_mdt_capab;
 	boolean_t	smac_mdt;
 
-	/*
-	 * The following fields are protected by smac_lock
-	 */
-	krwlock_t	smac_lock;
-	enum softmac_state	smac_state;
+	/* Following fields protected by the mac perimeter */
+	softmac_lower_state_t	smac_lower_state;
 	/* Lower stream structure */
 	softmac_lower_t	*smac_lower;
 } softmac_t;
@@ -193,9 +207,6 @@ typedef struct smac_ioc_start_s {
 #define	SMAC_IOC	('S' << 24 | 'M' << 16 | 'C' << 8)
 #define	SMAC_IOC_START	(SMAC_IOC | 0x01)
 
-#define	SOFTMAC_BLANK_TICKS	128
-#define	SOFTMAC_BLANK_PKT_COUNT	8
-
 extern dev_info_t	*softmac_dip;
 #define	SOFTMAC_DEV_NAME	"softmac"
@@ -217,9 +228,9 @@ extern int softmac_m_unicst(void *, const uint8_t *);
 extern void softmac_m_ioctl(void *, queue_t *, mblk_t *);
 extern int softmac_m_stat(void *, uint_t, uint64_t *);
 extern mblk_t *softmac_m_tx(void *, mblk_t *);
-extern void softmac_m_resources(void *);
 extern int softmac_proto_tx(softmac_lower_t *, mblk_t *, mblk_t **);
 extern void softmac_ioctl_tx(softmac_lower_t *, mblk_t *, mblk_t **);
+extern void softmac_notify_thread(void *);
 
 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index 64e52ba808..ec09b3a88b 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_SQUEUE_H
 #define	_SYS_SQUEUE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -47,7 +44,30 @@ typedef struct squeue_s squeue_t;
 	(mp)->b_prev = (mblk_t *)(arg);		\
 }
 
-#define	GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
+#define	GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
+
+#define	SQ_FILL		0x0001
+#define	SQ_NODRAIN	0x0002
+#define	SQ_PROCESS	0x0004
+
+#define	SQUEUE_ENTER(sqp, head, tail, cnt, flag, tag) {		\
+	sqp->sq_enter(sqp, head, tail, cnt, flag, tag);		\
+}
+
+#define	SQUEUE_ENTER_ONE(sqp, mp, proc, arg, flag, tag) {	\
+	ASSERT(mp->b_next == NULL);				\
+	ASSERT(mp->b_prev == NULL);				\
+	SET_SQUEUE(mp, proc, arg);				\
+	SQUEUE_ENTER(sqp, mp, mp, 1, flag, tag);		\
+}
+
+/*
+ * May be called only by a thread executing in the squeue. The thread must
+ * not continue to execute any code needing squeue protection after calling
+ * this macro. Please see the comments in squeue.c for more details.
+ */
+#define	SQUEUE_SWITCH(connp, new_sqp)				\
+	(connp)->conn_sqp = new_sqp;
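
One plausible reading of the SQ_FILL/SQ_NODRAIN/SQ_PROCESS flags consumed by SQUEUE_ENTER above: SQ_FILL only queues the chain, SQ_NODRAIN queues it and leaves draining to the worker, and SQ_PROCESS lets the caller drain inline. The authoritative semantics live in squeue.c; the sketch below is a user-space model with invented names.

#include <stdio.h>

#define	SQ_FILL		0x0001
#define	SQ_NODRAIN	0x0002
#define	SQ_PROCESS	0x0004

static int sq_count;		/* queued packets (models sq_count) */

static void
squeue_enter_model(int npkts, int flag)
{
	sq_count += npkts;	/* append the chain to sq_first/sq_last */

	if (flag & SQ_PROCESS) {
		/* the caller becomes the drainer */
		printf("drain %d packet(s) inline\n", sq_count);
		sq_count = 0;
	} else if (flag & SQ_NODRAIN) {
		printf("queued, worker will drain %d later\n", sq_count);
	} else {
		printf("queued only (%d pending)\n", sq_count);
	}
}

int
main(void)
{
	squeue_enter_model(1, SQ_FILL);
	squeue_enter_model(2, SQ_NODRAIN);
	squeue_enter_model(1, SQ_PROCESS);
	return (0);
}
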
 
 /*
  * Facility-special private data in squeues.
@@ -57,26 +77,13 @@ typedef enum {
 	SQPRIVATE_MAX
 } sqprivate_t;
 
-typedef void (*sqproc_t)(void *, mblk_t *, void *);
-
 extern void squeue_init(void);
-extern squeue_t *squeue_create(char *, processorid_t, clock_t, pri_t);
+extern squeue_t *squeue_create(clock_t, pri_t);
 extern void squeue_bind(squeue_t *, processorid_t);
 extern void squeue_unbind(squeue_t *);
-extern void squeue_enter_chain(squeue_t *, mblk_t *, mblk_t *,
-    uint32_t, uint8_t);
-extern void squeue_enter(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
-extern void squeue_enter_nodrain(squeue_t *, mblk_t *, sqproc_t, void *,
-    uint8_t);
-extern void squeue_fill(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
+extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
+    uint32_t, int, uint8_t);
 extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
-extern processorid_t squeue_binding(squeue_t *);
-
-extern void squeue_profile_reset(squeue_t *);
-extern void squeue_profile_enable(squeue_t *);
-extern void squeue_profile_disable(squeue_t *);
-extern void squeue_profile_stop(void);
-extern void squeue_profile_start(void);
 
 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 54870c067c..501377e53f 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -19,20 +19,21 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_SQUEUE_IMPL_H
 #define	_SYS_SQUEUE_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#include <sys/disp.h>
+#include <sys/types.h>
 #include <sys/squeue.h>
+#include <inet/ip.h>
 
 #define	SQ_NAMELEN 31
 
@@ -55,6 +56,8 @@ extern "C" {
 #define	SQUEUE_PROFILE 0
 #endif
 
+#define	SQUEUE_DEFAULT_PRIORITY	MAXCLSYSPRI
+
 typedef struct sqstat_s {
 	uint_t		sq_max_qlen;
 	uint_t		sq_npackets_worker;
@@ -70,60 +73,102 @@ typedef struct sqstat_s {
 	hrtime_t	sq_time_other;
 } sqstat_t;
 
+typedef struct squeue_set_s {
+	squeue_t	*sqs_head;
+	squeue_t	*sqs_default;
+	processorid_t	sqs_cpuid;
+} squeue_set_t;
+
+typedef void (*sqproc_t)(void *, mblk_t *, void *);
+typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t,
+    int, uint8_t);
+typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t);
+
+extern void squeue_worker_wakeup(squeue_t *);
+extern int ip_squeue_flag;
+
 struct squeue_s {
-	/* Keep the most used members 64bytes cache aligned */
+	sq_enter_proc_t	sq_enter;	/* sq_process function */
+	sq_drain_proc_t	sq_drain;	/* sq_drain function */
 	kmutex_t	sq_lock;	/* lock before using any member */
 	uint32_t	sq_state;	/* state flags and message count */
 	int		sq_count;	/* # of mblocks in squeue */
 	mblk_t		*sq_first;	/* first mblk chain or NULL */
 	mblk_t		*sq_last;	/* last mblk chain or NULL */
-	clock_t		sq_awaken;	/* time async thread was awakened */
 	kthread_t	*sq_run;	/* Current thread processing sq */
-	void		*sq_rx_ring;
-	clock_t		sq_avg_drain_time; /* Avg time to drain a pkt */
+	ill_rx_ring_t	*sq_rx_ring;	/* The Rx ring tied to this sq */
+	ill_t		*sq_ill;	/* The ill this squeue is tied to */
 
-	processorid_t	sq_bind;	/* processor to bind to */
-	kcondvar_t	sq_async;	/* async thread blocks on */
+	clock_t		sq_curr_time;	/* Current tick (lbolt) */
+	kcondvar_t
sq_worker_cv; /* cond var. worker thread blocks on */ + kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */ + kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */ clock_t sq_wait; /* lbolts to wait after a fill() */ - uintptr_t sq_private[SQPRIVATE_MAX]; timeout_id_t sq_tid; /* timer id of pending timeout() */ + clock_t sq_awaken; /* time async thread was awakened */ + + processorid_t sq_bind; /* processor to bind to */ kthread_t *sq_worker; /* kernel thread id */ - char sq_name[SQ_NAMELEN + 1]; + kthread_t *sq_poll_thr; /* polling thread */ + uintptr_t sq_private[SQPRIVATE_MAX]; + + squeue_t *sq_next; /* managed by squeue creator */ + squeue_set_t *sq_set; /* managed by squeue creator */ -#if SQUEUE_DEBUG - /* Debug-only fields */ + pri_t sq_priority; /* squeue thread priority */ + + /* Keep the debug-only fields at the end of the structure */ +#ifdef DEBUG int sq_isintr; /* serviced by interrupt */ mblk_t *sq_curmp; void (*sq_curproc)(); conn_t *sq_connp; uchar_t sq_tag; #endif - -#if SQUEUE_PROFILE - /* Profiling fields */ - kstat_t *sq_kstat; /* exported statistics */ - sqstat_t sq_stats; -#endif }; /* * State flags. * Note: The MDB IP module depends on the values of these flags. */ -#define SQS_PROC 0x0001 /* being processed */ -#define SQS_WORKER 0x0002 /* worker thread */ -#define SQS_ENTER 0x0004 /* enter thread */ -#define SQS_FAST 0x0008 /* enter-fast thread */ -#define SQS_USER 0x0010 /* A non interrupt user */ -#define SQS_BOUND 0x0020 /* Worker thread is bound */ -#define SQS_PROFILE 0x0040 /* Enable profiling */ -#define SQS_REENTER 0x0080 /* Re entered thread */ -#define SQS_TMO_PROG 0x0100 /* Timeout is being set */ -#define SQS_POLL_CAPAB 0x0200 /* Squeue can control interrupts */ -#define SQS_NO_INTR 0x0400 /* Interrupts currently disabled */ -#define SQS_ILL_BOUND 0x0800 /* Squeue bound to an ill */ -#define SQS_GET_PKTS 0x1000 /* Moving pkts from NIC in progress */ -#define SQS_DEFAULT 0x2000 /* The default squeue for the CPU */ +#define SQS_PROC 0x00000001 /* being processed */ +#define SQS_WORKER 0x00000002 /* worker thread */ +#define SQS_ENTER 0x00000004 /* enter thread */ +#define SQS_FAST 0x00000008 /* enter-fast thread */ + +#define SQS_USER 0x00000010 /* A non interrupt user */ +#define SQS_BOUND 0x00000020 /* Worker thread is bound */ +#define SQS_REENTER 0x00000040 /* Re entered thread */ +#define SQS_TMO_PROG 0x00000080 /* Timeout is being set */ + +#define SQS_POLL_CAPAB 0x00000100 /* Squeue can control interrupts */ +#define SQS_ILL_BOUND 0x00000200 /* Squeue bound to an ill */ +#define SQS_GET_PKTS 0x00000400 /* Moving pkts from NIC in progress */ +#define SQS_DEFAULT 0x00000800 /* The default squeue for the CPU */ + +#define SQS_POLLING 0x00001000 /* Squeue in polling mode */ +#define SQS_INTR_BLANK 0x00002000 /* Interrupt blanking capability */ +#define SQS_PROC_HELD 0x00004000 /* SQS_PROC is held by the caller */ +#define SQS_FORCE_TIMER 0x00008000 /* Schedule worker due to B/W control */ + +#define SQS_POLL_CLEANUP 0x00010000 +#define SQS_POLL_CLEANUP_DONE 0x00020000 +#define SQS_POLL_QUIESCE 0x00040000 +#define SQS_POLL_QUIESCE_DONE 0x00080000 + +#define SQS_POLL_RESTART 0x00100000 +#define SQS_POLL_THR_QUIESCED 0x00200000 +#define SQS_POLL_THR_RESTART 0x00400000 +#define SQS_POLL_PROC 0x00800000 /* Poll thread processing the sq */ + +#define SQS_POLL_RESTART_DONE 0x01000000 +#define SQS_POLL_THR_QUIESCE 0x02000000 + +#define SQS_WORKER_THR_CONTROL \ + (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) + +#define 
SQS_POLL_THR_CONTROL \ + (SQS_POLL_THR_QUIESCE | SQS_POLL_THR_RESTART) #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 6436c5a0cc..41097cab7f 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -30,8 +30,6 @@ #ifndef _SYS_STREAM_H #define _SYS_STREAM_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 11.44 */ - /* * For source compatibility */ @@ -414,6 +412,7 @@ typedef struct bcache { #define STRUIO_ZCNOTIFY 0x10 /* notify stream head when mblk acked */ #define STRUIO_EAGER 0x20 /* new eager; db_cksumstart has squeue to use */ #define STRUIO_POLICY 0x40 /* new eager when IPsec is enabled */ +#define STRUIO_CONNECT 0x80 /* conn did a connect */ /* * Message flags. These are interpreted by the stream head. diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 401e69dc5e..04c778feaa 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -30,8 +30,6 @@ #ifndef _SYS_STRSUBR_H #define _SYS_STRSUBR_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.17 */ - /* * WARNING: * Everything in this file is private, belonging to the @@ -1238,6 +1236,8 @@ extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int); extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); +extern void lso_info_set(mblk_t *, uint32_t, uint32_t); +extern void lso_info_get(mblk_t *, uint32_t *, uint32_t *); extern unsigned int bcksum(uchar_t *, int, unsigned int); extern boolean_t is_vmloaned_mblk(mblk_t *, struct multidata_s *, struct pdesc_s *); diff --git a/usr/src/uts/common/sys/vlan.h b/usr/src/uts/common/sys/vlan.h index 2a4e4c8ef0..11c7d41e83 100644 --- a/usr/src/uts/common/sys/vlan.h +++ b/usr/src/uts/common/sys/vlan.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,14 +30,14 @@ #ifndef _SYS_VLAN_H #define _SYS_VLAN_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #define VLAN_TAGSZ 4 +#define VLAN_TPID 0x8100u + #define VLAN_ID_MASK 0x0fffu #define VLAN_ID_SIZE 12 #define VLAN_ID_SHIFT 0 diff --git a/usr/src/uts/common/sys/vnic.h b/usr/src/uts/common/sys/vnic.h index d17da6bf44..37f962e2ff 100644 --- a/usr/src/uts/common/sys/vnic.h +++ b/usr/src/uts/common/sys/vnic.h @@ -30,35 +30,101 @@ #include <sys/ethernet.h> #include <sys/param.h> #include <sys/mac.h> +#include <sys/mac_flow.h> #include <sys/dld_ioc.h> +#include <inet/ip.h> +#include <inet/ip6.h> #ifdef __cplusplus extern "C" { #endif /* - * Note that the datastructures defined here define an ioctl interface - * that is shared betwen user and kernel space. The vnic driver thus - * assumes that the structures have identical layout and size when - * compiled in either IPL32 or LP64. 
+ * Extended diagnostic codes that can be returned by the various
+ * VNIC ioctls.
 */
+typedef enum {
+	VNIC_IOC_DIAG_NONE,
+	VNIC_IOC_DIAG_MACADDR_NIC,
+	VNIC_IOC_DIAG_MACADDR_INUSE,
+	VNIC_IOC_DIAG_MACADDR_INVALID,
+	VNIC_IOC_DIAG_MACADDRLEN_INVALID,
+	VNIC_IOC_DIAG_MACFACTORYSLOTINVALID,
+	VNIC_IOC_DIAG_MACFACTORYSLOTUSED,
+	VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED,
+	VNIC_IOC_DIAG_MACFACTORYNOTSUP,
+	VNIC_IOC_DIAG_MACPREFIX_INVALID,
+	VNIC_IOC_DIAG_MACPREFIXLEN_INVALID,
+	VNIC_IOC_DIAG_MACMARGIN_INVALID,
+	VNIC_IOC_DIAG_NO_HWRINGS
+} vnic_ioc_diag_t;
 
 /*
- * For now, we support only MAC addresses specified by value.
+ * Allowed VNIC MAC address types.
+ *
+ * - VNIC_MAC_ADDR_TYPE_FIXED, VNIC_MAC_ADDR_TYPE_RANDOM:
+ *   The MAC address is specified by value by the caller, which
+ *   itself can obtain it from the user directly
+ *   or pick it in a random fashion. Which method is used by the
+ *   caller is irrelevant to the VNIC driver. However, two different
+ *   types are provided so that the information can be made available
+ *   back to user-space when listing the kernel-defined VNICs.
+ *
+ *   When a VNIC is created, the address is passed through the
+ *   vc_mac_addr and vc_mac_len fields of the vnic_ioc_create_t
+ *   structure.
+ *
+ * - VNIC_MAC_ADDR_TYPE_FACTORY: the MAC address is obtained from
+ *   one of the factory MAC addresses of the underlying NIC.
+ *
+ * - VNIC_MAC_ADDR_TYPE_AUTO: the VNIC driver attempts to
+ *   obtain the address from one of the factory MAC addresses of
+ *   the underlying NIC. If none is available, the specified
+ *   MAC address value is used.
+ *
+ * - VNIC_MAC_ADDR_TYPE_PRIMARY: this is a VNIC-based VLAN. The
+ *   address for this is the address of the primary MAC client.
+ *
 */
 typedef enum {
-	VNIC_MAC_ADDR_TYPE_FIXED
+	VNIC_MAC_ADDR_TYPE_FIXED,
+	VNIC_MAC_ADDR_TYPE_RANDOM,
+	VNIC_MAC_ADDR_TYPE_FACTORY,
+	VNIC_MAC_ADDR_TYPE_AUTO,
+	VNIC_MAC_ADDR_TYPE_PRIMARY
 } vnic_mac_addr_type_t;
 
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
 #define	VNIC_IOC_CREATE		VNICIOC(1)
+#define	VNIC_IOC_CREATE_NODUPCHECK	0x00000001
+#define	VNIC_IOC_CREATE_ANCHOR		0x00000002
+
+/*
+ * Force creation of a VLAN-based VNIC without checking if the
+ * underlying MAC supports the margin size.
+ */ +#define VNIC_IOC_CREATE_FORCE 0x00000004 + +/* Allocate a hardware ring to the vnic */ +#define VNIC_IOC_CREATE_REQ_HWRINGS 0x00000008 + typedef struct vnic_ioc_create { datalink_id_t vc_vnic_id; datalink_id_t vc_link_id; - uint_t vc_mac_len; vnic_mac_addr_type_t vc_mac_addr_type; + uint_t vc_mac_len; uchar_t vc_mac_addr[MAXMACADDRLEN]; + uint_t vc_mac_prefix_len; + int vc_mac_slot; + uint16_t vc_vid; + uint_t vc_status; + uint_t vc_flags; + vnic_ioc_diag_t vc_diag; + mac_resource_props_t vc_resource_props; } vnic_ioc_create_t; #define VNIC_IOC_DELETE VNICIOC(2) @@ -69,33 +135,43 @@ typedef struct vnic_ioc_delete { #define VNIC_IOC_INFO VNICIOC(3) -typedef struct vnic_ioc_info_vnic { +typedef struct vnic_info { datalink_id_t vn_vnic_id; datalink_id_t vn_link_id; - uint32_t vn_mac_len; - uchar_t vn_mac_addr[MAXMACADDRLEN]; vnic_mac_addr_type_t vn_mac_addr_type; -} vnic_ioc_info_vnic_t; + uint_t vn_mac_len; + uchar_t vn_mac_addr[MAXMACADDRLEN]; + uint_t vn_mac_slot; + uint32_t vn_mac_prefix_len; + uint16_t vn_vid; + boolean_t vn_force; + mac_resource_props_t vn_resource_props; +} vnic_info_t; typedef struct vnic_ioc_info { - uint_t vi_nvnics; - uint_t vi_size; - datalink_id_t vi_vnic_id; /* DATALINK_ALL_LINKID returns all */ - datalink_id_t vi_linkid; + vnic_info_t vi_info; } vnic_ioc_info_t; #define VNIC_IOC_MODIFY VNICIOC(4) #define VNIC_IOC_MODIFY_ADDR 0x01 +#define VNIC_IOC_MODIFY_RESOURCE_CTL 0x02 typedef struct vnic_ioc_modify { datalink_id_t vm_vnic_id; uint_t vm_modify_mask; + uint_t vm_mac_len; + int vm_mac_slot; uchar_t vm_mac_addr[MAXMACADDRLEN]; vnic_mac_addr_type_t vm_mac_addr_type; - uint_t vm_mac_len; + mac_resource_props_t vm_resource_props; + vnic_ioc_diag_t vm_diag; } vnic_ioc_modify_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 6cb64523a8..b5dd59eea3 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -26,96 +26,40 @@ #ifndef _SYS_VNIC_IMPL_H #define _SYS_VNIC_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/vnic.h> +#include <sys/mac_flow.h> #include <sys/ksynch.h> #ifdef __cplusplus extern "C" { #endif -typedef void (*vnic_rx_fn_t)(void *, void *, mblk_t *); - -typedef struct vnic_flow_fn_info_s { - vnic_rx_fn_t ff_fn; - void *ff_arg1; - void *ff_arg2; -} vnic_flow_fn_info_t; - -typedef struct vnic_flow_s { - uchar_t vf_addr[MAXMACADDRLEN]; - uint_t vf_addr_len; - vnic_flow_fn_info_t vf_fn_info; - void *vf_cookie; - struct vnic_flow_s *vf_next; - kmutex_t vf_lock; - kcondvar_t vf_cv; - uint32_t vf_refs; - boolean_t vf_clearing; - boolean_t vf_is_active; -} vnic_flow_t; - -typedef struct vnic_flow_tab_s { - vnic_flow_t *vt_flow_list; - krwlock_t vt_lock; - uint_t vt_addr_len; -} vnic_flow_tab_t; - -typedef struct vnic_mac_s { - mac_handle_t va_mh; - uint_t va_refs; - datalink_id_t va_linkid; - const mac_txinfo_t *va_txinfo; - struct vnic_bcast_grp_s *va_bcast_grp; - krwlock_t va_bcast_grp_lock; - size_t va_addr_len; - mac_notify_handle_t va_notify_hdl; - mac_rx_handle_t va_rx_hdl; - vnic_flow_t *va_active_flow; - vnic_flow_tab_t *va_flow_tab; - boolean_t va_mac_set; - struct vnic_s *va_promisc; - krwlock_t va_promisc_lock; - uint64_t va_promisc_gen; -} vnic_mac_t; - typedef struct vnic_s { - datalink_id_t vn_id; + datalink_id_t vn_id; uint32_t - vn_started : 1, - 
vn_promisc : 1, - vn_bcast_grp : 1, - vn_multi_mac : 1, - vn_promisc_mac : 1, - vn_pad_to_bit_31 : 27; - - int vn_slot_id; - multiaddress_capab_t vn_mma_capab; - uint8_t vn_addr[ETHERADDRL]; - vnic_mac_addr_type_t vn_addr_type; - - mac_handle_t vn_mh; - uint32_t vn_margin; - vnic_mac_t *vn_vnic_mac; - vnic_flow_t *vn_flow_ent; - uint32_t vn_hcksum_txflags; - struct vnic_s *vn_promisc_next; - - uint64_t vn_stat_multircv; - uint64_t vn_stat_brdcstrcv; - uint64_t vn_stat_multixmt; - uint64_t vn_stat_brdcstxmt; - uint64_t vn_stat_ierrors; - uint64_t vn_stat_oerrors; - uint64_t vn_stat_rbytes; - uint64_t vn_stat_ipackets; - uint64_t vn_stat_obytes; - uint64_t vn_stat_opackets; + vn_started : 1, + vn_pad_to_bit_31 : 31; + + mac_handle_t vn_mh; + mac_handle_t vn_lower_mh; + mac_client_handle_t vn_mch; + mac_unicast_handle_t vn_muh; + uint32_t vn_margin; + int vn_slot_id; + vnic_mac_addr_type_t vn_addr_type; + uint8_t vn_addr[MAXMACADDRLEN]; + size_t vn_addr_len; + uint16_t vn_vid; + boolean_t vn_force; + datalink_id_t vn_link_id; + mac_notify_handle_t vn_mnh; + + uint32_t vn_hcksum_txflags; } vnic_t; -#define vn_txinfo vn_vnic_mac->va_txinfo - #define vn_madd_naddr vn_mma_capab.maddr_naddr #define vn_maddr_naddrfree vn_mma_capab.maddr_naddrfree #define vn_maddr_flag vn_mma_capab.maddr_flag @@ -126,68 +70,19 @@ typedef struct vnic_s { #define vn_maddr_modify vn_mma_capab.maddr_modify #define vn_maddr_get vn_mma_capab.maddr_get -#define VNIC_FLOW_REFHOLD(flow) { \ - mutex_enter(&(flow)->vf_lock); \ - (flow)->vf_refs++; \ - mutex_exit(&(flow)->vf_lock); \ -} - -#define VNIC_FLOW_REFRELE(flow) { \ - mutex_enter(&(flow)->vf_lock); \ - if (--(flow)->vf_refs == 0 && (flow)->vf_clearing) { \ - (flow)->vf_clearing = B_FALSE; \ - cv_signal(&(flow)->vf_cv); \ - } \ - mutex_exit(&(flow)->vf_lock); \ -} - -extern int vnic_dev_create(datalink_id_t, datalink_id_t, int, uchar_t *); +extern int vnic_dev_create(datalink_id_t, datalink_id_t, vnic_mac_addr_type_t *, + int *, uchar_t *, int *, uint_t, uint16_t, mac_resource_props_t *, + uint32_t, vnic_ioc_diag_t *); extern int vnic_dev_modify(datalink_id_t, uint_t, vnic_mac_addr_type_t, - uint_t, uchar_t *); -extern int vnic_dev_delete(datalink_id_t); - -typedef int (*vnic_info_new_vnic_fn_t)(void *, datalink_id_t, - vnic_mac_addr_type_t, uint_t, uint8_t *, datalink_id_t); + uint_t, uchar_t *, uint_t, mac_resource_props_t *); +extern int vnic_dev_delete(datalink_id_t, uint32_t); extern void vnic_dev_init(void); extern void vnic_dev_fini(void); extern uint_t vnic_dev_count(void); extern dev_info_t *vnic_get_dip(void); -extern int vnic_info(uint_t *, datalink_id_t, datalink_id_t, void *, - vnic_info_new_vnic_fn_t); - -extern void vnic_rx(void *, void *, mblk_t *); -extern mblk_t *vnic_fix_cksum(mblk_t *); -extern mblk_t *vnic_copymsgchain_cksum(mblk_t *); -extern mblk_t *vnic_copymsg_cksum(mblk_t *); - -extern void vnic_promisc_rx(vnic_mac_t *, vnic_t *, mblk_t *); - -extern void vnic_bcast_init(void); -extern void vnic_bcast_fini(void); -extern int vnic_bcast_add(vnic_t *, const uint8_t *, mac_addrtype_t); -extern void vnic_bcast_delete(vnic_t *, const uint8_t *); -extern void vnic_bcast_send(void *, void *, mblk_t *); - -extern void vnic_classifier_init(void); -extern void vnic_classifier_fini(void); -extern vnic_flow_t *vnic_classifier_flow_create(uint_t, uchar_t *, void *, - boolean_t, int); -extern void vnic_classifier_flow_destroy(vnic_flow_t *); -extern void vnic_classifier_flow_add(vnic_mac_t *, vnic_flow_t *, vnic_rx_fn_t, - void *, void *); -extern void 
vnic_classifier_flow_remove(vnic_mac_t *, vnic_flow_t *); -extern void vnic_classifier_flow_update_addr(vnic_flow_t *, uchar_t *); -extern void vnic_classifier_flow_update_fn(vnic_flow_t *, vnic_rx_fn_t, - void *, void *); -extern int vnic_classifier_flow_tab_init(vnic_mac_t *, uint_t, int); -extern void vnic_classifier_flow_tab_fini(vnic_mac_t *); -extern vnic_flow_t *vnic_classifier_get_flow(vnic_mac_t *, mblk_t *); -extern void *vnic_classifier_get_client_cookie(vnic_flow_t *); -extern vnic_flow_fn_info_t *vnic_classifier_get_fn_info(vnic_flow_t *); -extern boolean_t vnic_classifier_is_active(vnic_flow_t *); - +extern int vnic_info(vnic_info_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/syscall/acctctl.c b/usr/src/uts/common/syscall/acctctl.c index 4fb322a211..ce325109be 100644 --- a/usr/src/uts/common/syscall/acctctl.c +++ b/usr/src/uts/common/syscall/acctctl.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/proc.h> #include <sys/systm.h> #include <sys/param.h> @@ -115,6 +113,7 @@ ac_file_in_use(vnode_t *vp) mutex_enter(&acg->ac_proc.ac_lock); mutex_enter(&acg->ac_task.ac_lock); mutex_enter(&acg->ac_flow.ac_lock); + mutex_enter(&acg->ac_net.ac_lock); } for (acg = list_head(&exacct_globals_list); !in_use && acg != NULL; @@ -125,7 +124,8 @@ ac_file_in_use(vnode_t *vp) */ if (vn_compare(acg->ac_proc.ac_vnode, vp) || vn_compare(acg->ac_task.ac_vnode, vp) || - vn_compare(acg->ac_flow.ac_vnode, vp)) + vn_compare(acg->ac_flow.ac_vnode, vp) || + vn_compare(acg->ac_net.ac_vnode, vp)) in_use = B_TRUE; } @@ -137,6 +137,7 @@ ac_file_in_use(vnode_t *vp) mutex_exit(&acg->ac_proc.ac_lock); mutex_exit(&acg->ac_task.ac_lock); mutex_exit(&acg->ac_flow.ac_lock); + mutex_exit(&acg->ac_net.ac_lock); } mutex_exit(&exacct_globals_list_lock); return (in_use); @@ -449,17 +450,21 @@ acctctl(int cmd, void *buf, size_t bufsz) info = &acg->ac_proc; maxres = AC_PROC_MAX_RES; break; + /* + * Flow/net accounting isn't configurable in non-global + * zones, but we have this field on a per-zone basis for future + * expansion as well as the ability to return default "unset" + * values for the various AC_*_GET queries. AC_*_SET commands + * fail with EPERM for AC_FLOW and AC_NET in non-global zones. + */ case AC_FLOW: - /* - * Flow accounting isn't currently configurable in non-global - * zones, but we have this field on a per-zone basis for future - * expansion as well as the ability to return default "unset" - * values for the various AC_*_GET queries. AC_*_SET commands - * fail with EPERM for AC_FLOW in non-global zones. 
- */ info = &acg->ac_flow; maxres = AC_FLOW_MAX_RES; break; + case AC_NET: + info = &acg->ac_net; + maxres = AC_NET_MAX_RES; + break; default: return (set_errno(EINVAL)); } @@ -468,7 +473,8 @@ acctctl(int cmd, void *buf, size_t bufsz) case AC_STATE_SET: if ((error = secpolicy_acct(CRED())) != 0) break; - if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + if ((mode == AC_FLOW || mode == AC_NET) && + getzoneid() != GLOBAL_ZONEID) { error = EPERM; break; } @@ -480,7 +486,8 @@ acctctl(int cmd, void *buf, size_t bufsz) case AC_FILE_SET: if ((error = secpolicy_acct(CRED())) != 0) break; - if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + if ((mode == AC_FLOW || mode == AC_NET) && + getzoneid() != GLOBAL_ZONEID) { error = EPERM; break; } @@ -492,7 +499,8 @@ acctctl(int cmd, void *buf, size_t bufsz) case AC_RES_SET: if ((error = secpolicy_acct(CRED())) != 0) break; - if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + if ((mode == AC_FLOW || mode == AC_NET) && + getzoneid() != GLOBAL_ZONEID) { error = EPERM; break; } @@ -580,6 +588,7 @@ exacct_zone_shutdown(zoneid_t zoneid, void *data) exacct_free_info(&acg->ac_proc); exacct_free_info(&acg->ac_task); exacct_free_info(&acg->ac_flow); + exacct_free_info(&acg->ac_net); } /* ARGSUSED */ @@ -595,6 +604,7 @@ exacct_zone_fini(zoneid_t zoneid, void *data) mutex_destroy(&acg->ac_proc.ac_lock); mutex_destroy(&acg->ac_task.ac_lock); mutex_destroy(&acg->ac_flow.ac_lock); + mutex_destroy(&acg->ac_net.ac_lock); kmem_free(acg, sizeof (*acg)); } diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c index 6ac3e6e6ab..308f3c60ff 100644 --- a/usr/src/uts/common/xen/io/xnb.c +++ b/usr/src/uts/common/xen/io/xnb.c @@ -35,6 +35,7 @@ #include <sys/modctl.h> #include <sys/conf.h> #include <sys/mac.h> +#include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */ #include <sys/dlpi.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -247,7 +248,7 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp) (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, HCK_FULLCKSUM, KM_NOSLEEP); - return (vnic_fix_cksum(mp)); + return (mac_fix_cksum(mp)); } mblk_t * diff --git a/usr/src/uts/common/xen/io/xnbo.c b/usr/src/uts/common/xen/io/xnbo.c index 790e850289..79831ee7f1 100644 --- a/usr/src/uts/common/xen/io/xnbo.c +++ b/usr/src/uts/common/xen/io/xnbo.c @@ -34,8 +34,12 @@ #include "xnb.h" #include <sys/sunddi.h> +#include <sys/ddi.h> #include <sys/modctl.h> #include <sys/strsubr.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> #include <sys/mac.h> #include <net/if.h> #include <sys/dlpi.h> @@ -45,9 +49,9 @@ typedef struct xnbo { mac_handle_t o_mh; - mac_rx_handle_t o_mrh; - const mac_txinfo_t *o_mtx; - mac_notify_handle_t o_mnh; + mac_client_handle_t o_mch; + mac_unicast_handle_t o_mah; + mac_promisc_handle_t o_mphp; boolean_t o_running; boolean_t o_promiscuous; uint32_t o_hcksum_capab; @@ -70,11 +74,9 @@ xnbo_to_mac(xnb_t *xnbp, mblk_t *mp) goto fail; } - mp = xnbop->o_mtx->mt_fn(xnbop->o_mtx->mt_arg, mp); - - if (mp != NULL) { + if (mac_tx(xnbop->o_mch, mp, 0, + MAC_DROP_ON_NO_DESC, NULL) != NULL) { xnbp->xnb_stat_mac_full++; - goto fail; } return; @@ -156,7 +158,8 @@ xnbo_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) */ /*ARGSUSED*/ static void -xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { xnb_t *xnbp = arg; @@ -173,7 +176,8 @@ xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t 
*mp) */ /*ARGSUSED*/ static void -xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { xnb_t *xnbp = arg; xnbo_t *xnbop = xnbp->xnb_flavour_data; @@ -216,25 +220,12 @@ xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp) #undef ADD if (keep_head != NULL) - xnbo_from_mac(xnbp, mrh, keep_head); + xnbo_from_mac(xnbp, mrh, keep_head, B_FALSE); if (free_head != NULL) freemsgchain(free_head); } -static void -xnbo_notify(void *arg, mac_notify_type_t type) -{ - xnb_t *xnbp = arg; - xnbo_t *xnbop = xnbp->xnb_flavour_data; - - switch (type) { - case MAC_NOTE_PROMISC: - xnbop->o_mtx = mac_tx_get(xnbop->o_mh); - break; - } -} - static boolean_t xnbo_open_mac(xnb_t *xnbp, char *mac) { @@ -242,8 +233,10 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) int err, need_rx_filter, need_setphysaddr, need_promiscuous; const mac_info_t *mi; char *xsname; - void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *); + void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *, boolean_t); + struct ether_addr ea; uint_t max_sdu; + mac_diag_t diag; xsname = xvdi_get_xsname(xnbp->xnb_devinfo); @@ -279,8 +272,22 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) return (B_FALSE); } - xnbop->o_mnh = mac_notify_add(xnbop->o_mh, xnbo_notify, xnbp); - ASSERT(xnbop->o_mnh != NULL); + if ((err = mac_client_open(xnbop->o_mh, &xnbop->o_mch, NULL, + MAC_OPEN_FLAGS_USE_DATALINK_NAME)) != 0) { + cmn_err(CE_WARN, "xnbo_open_mac: " + "error (%d) opening mac client", err); + xnbo_close_mac(xnbop); + return (B_FALSE); + } + + err = mac_unicast_primary_add(xnbop->o_mch, &xnbop->o_mah, &diag); + if (err != 0) { + cmn_err(CE_WARN, "xnbo_open_mac: " + "failed to get the primary MAC address of " + "%s: %d", mac, err); + xnbo_close_mac(xnbop); + return (B_FALSE); + } /* * Should the receive path filter packets from the downstream @@ -294,11 +301,27 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) else rx_fn = xnbo_from_mac; - xnbop->o_mrh = mac_rx_add(xnbop->o_mh, rx_fn, xnbp); - ASSERT(xnbop->o_mrh != NULL); - - xnbop->o_mtx = mac_tx_get(xnbop->o_mh); - ASSERT(xnbop->o_mtx != NULL); + /* + * Should we set the underlying NIC into promiscuous mode? The + * default is "no". + */ + if (xenbus_scanf(XBT_NULL, xsname, + "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0) + need_promiscuous = 0; + if (need_promiscuous == 0) { + mac_rx_set(xnbop->o_mch, rx_fn, xnbp); + } else { + err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL, + rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); + if (err != 0) { + cmn_err(CE_WARN, "xnbo_open_mac: " + "cannot enable promiscuous mode of %s: %d", + mac, err); + xnbo_close_mac(xnbop); + return (B_FALSE); + } + xnbop->o_promiscuous = B_TRUE; + } if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM, &xnbop->o_hcksum_capab)) @@ -312,45 +335,17 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) "SUNW-need-set-physaddr", "%d", &need_setphysaddr) != 0) need_setphysaddr = 0; if (need_setphysaddr > 0) { - struct ether_addr ea; - - err = mac_unicst_set(xnbop->o_mh, xnbp->xnb_mac_addr); + err = mac_unicast_primary_set(xnbop->o_mh, xnbp->xnb_mac_addr); /* Warn, but continue on. */ if (err != 0) { bcopy(xnbp->xnb_mac_addr, ea.ether_addr_octet, ETHERADDRL); cmn_err(CE_WARN, "xnbo_open_mac: " "cannot set MAC address of %s to " - "%s: %d", mac, ether_sprintf(&ea), - err); - } - } - - /* - * Should we set the underlying NIC into promiscuous mode? The - * default is "no". 
- */ - if (xenbus_scanf(XBT_NULL, xsname, - "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0) - need_promiscuous = 0; - if (need_promiscuous > 0) { - err = mac_promisc_set(xnbop->o_mh, B_TRUE, MAC_DEVPROMISC); - if (err != 0) { - cmn_err(CE_WARN, "xnbo_open_mac: " - "cannot enable promiscuous mode of %s: %d", - mac, err); - xnbo_close_mac(xnbop); - return (B_FALSE); + "%s: %d", mac, ether_sprintf(&ea), err); } - xnbop->o_promiscuous = B_TRUE; } - if ((err = mac_start(xnbop->o_mh)) != 0) { - cmn_err(CE_WARN, "xnbo_open_mac: " - "cannot start mac device (%d)", err); - xnbo_close_mac(xnbop); - return (B_FALSE); - } xnbop->o_running = B_TRUE; return (B_TRUE); @@ -385,26 +380,24 @@ xnbo_close_mac(xnbo_t *xnbop) return; if (xnbop->o_running) { - mac_stop(xnbop->o_mh); xnbop->o_running = B_FALSE; } if (xnbop->o_promiscuous) { - (void) mac_promisc_set(xnbop->o_mh, B_FALSE, - MAC_DEVPROMISC); + (void) mac_promisc_remove(xnbop->o_mphp); xnbop->o_promiscuous = B_FALSE; + } else { + mac_rx_clear(xnbop->o_mch); } - xnbop->o_mtx = NULL; - - if (xnbop->o_mrh != NULL) { - mac_rx_remove(xnbop->o_mh, xnbop->o_mrh, B_TRUE); - xnbop->o_mrh = NULL; + if (xnbop->o_mah != NULL) { + (void) mac_unicast_remove(xnbop->o_mch, xnbop->o_mah); + xnbop->o_mah = NULL; } - if (xnbop->o_mnh != NULL) { - mac_notify_remove(xnbop->o_mh, xnbop->o_mnh); - xnbop->o_mnh = NULL; + if (xnbop->o_mch != NULL) { + mac_client_close(xnbop->o_mch, 0); + xnbop->o_mch = NULL; } mac_close(xnbop->o_mh); @@ -453,8 +446,9 @@ xnbo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) xnbop = kmem_zalloc(sizeof (*xnbop), KM_SLEEP); xnbop->o_mh = NULL; - xnbop->o_mrh = NULL; - xnbop->o_mtx = NULL; + xnbop->o_mch = NULL; + xnbop->o_mah = NULL; + xnbop->o_mphp = NULL; xnbop->o_running = B_FALSE; xnbop->o_hcksum_capab = 0; diff --git a/usr/src/uts/common/xen/io/xnbu.c b/usr/src/uts/common/xen/io/xnbu.c index f5c0ba9809..80e2378608 100644 --- a/usr/src/uts/common/xen/io/xnbu.c +++ b/usr/src/uts/common/xen/io/xnbu.c @@ -40,7 +40,7 @@ #include <sys/strsubr.h> #include <sys/dlpi.h> #include <sys/pattr.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <xen/sys/xendev.h> @@ -51,19 +51,16 @@ static int xnbu_m_set_mac_addr(void *, const uint8_t *); static int xnbu_m_set_multicast(void *, boolean_t, const uint8_t *); static int xnbu_m_set_promiscuous(void *, boolean_t); static int xnbu_m_stat(void *, uint_t, uint64_t *); -static void xnbu_m_blank(void *, time_t, uint_t); -static void xnbu_m_resources(void *); static boolean_t xnbu_m_getcapab(void *, mac_capab_t, void *); static mblk_t *xnbu_m_send(void *, mblk_t *); typedef struct xnbu { mac_handle_t u_mh; - mac_resource_handle_t u_mrh; boolean_t u_need_sched; } xnbu_t; static mac_callbacks_t xnb_callbacks = { - MC_RESOURCES | MC_GETCAPAB, + MC_GETCAPAB, xnbu_m_stat, xnbu_m_start, xnbu_m_stop, @@ -71,7 +68,6 @@ static mac_callbacks_t xnb_callbacks = { xnbu_m_set_multicast, xnbu_m_set_mac_addr, xnbu_m_send, - xnbu_m_resources, NULL, xnbu_m_getcapab }; @@ -84,7 +80,7 @@ xnbu_to_host(xnb_t *xnbp, mblk_t *mp) ASSERT(mp != NULL); - mac_rx(xnbup->u_mh, xnbup->u_mrh, mp); + mac_rx(xnbup->u_mh, NULL, mp); mutex_enter(&xnbp->xnb_rx_lock); @@ -328,32 +324,6 @@ xnbu_m_stat(void *arg, uint_t stat, uint64_t *val) return (0); } -/*ARGSUSED*/ -static void -xnbu_m_blank(void *arg, time_t ticks, uint_t count) -{ - /* - * XXPV dme: blanking is not currently implemented. 
- */ -} - -static void -xnbu_m_resources(void *arg) -{ - xnb_t *xnbp = arg; - xnbu_t *xnbup = xnbp->xnb_flavour_data; - mac_rx_fifo_t mrf; - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = xnbu_m_blank; - mrf.mrf_arg = (void *)xnbp; - mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnbu_m_blank() */ - mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnbu_m_blank() */ - - xnbup->u_mrh = mac_resource_add(xnbup->u_mh, - (mac_resource_t *)&mrf); -} - static boolean_t xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { @@ -369,11 +339,6 @@ xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *capab = 0; break; } - - case MAC_CAPAB_POLL: - /* Just return B_TRUE. */ - break; - default: return (B_FALSE); } diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c index c14c651c61..0813d6cbe1 100644 --- a/usr/src/uts/common/xen/io/xnf.c +++ b/usr/src/uts/common/xen/io/xnf.c @@ -80,7 +80,7 @@ #include <inet/ip_impl.h> #include <sys/gld.h> #include <sys/modctl.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/bootinfo.h> #include <sys/mach_mmu.h> @@ -148,8 +148,6 @@ static int xnf_set_promiscuous(void *, boolean_t); static mblk_t *xnf_send(void *, mblk_t *); static uint_t xnf_intr(caddr_t); static int xnf_stat(void *, uint_t, uint64_t *); -static void xnf_blank(void *, time_t, uint_t); -static void xnf_resources(void *); static void xnf_ioctl(void *, queue_t *, mblk_t *); static boolean_t xnf_getcapab(void *, mac_capab_t, void *); @@ -178,7 +176,7 @@ static boolean_t xnf_kstat_init(xnf_t *xnfp); * XXPV dme: remove MC_IOCTL? */ static mac_callbacks_t xnf_callbacks = { - MC_RESOURCES | MC_IOCTL | MC_GETCAPAB, + MC_IOCTL | MC_GETCAPAB, xnf_stat, xnf_start, xnf_stop, @@ -186,7 +184,6 @@ static mac_callbacks_t xnf_callbacks = { xnf_set_multicast, xnf_set_mac_addr, xnf_send, - xnf_resources, xnf_ioctl, xnf_getcapab }; @@ -1436,7 +1433,7 @@ xnf_intr(caddr_t arg) mp = xnf_process_recv(xnfp); if (mp != NULL) - mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp); + mac_rx(xnfp->xnf_mh, NULL, mp); } xnfp->xnf_stat_interrupts++; @@ -2518,39 +2515,6 @@ xnf_stat(void *arg, uint_t stat, uint64_t *val) /*ARGSUSED*/ static void -xnf_blank(void *arg, time_t ticks, uint_t count) -{ - /* - * XXPV dme: blanking is not currently implemented. - * - * It's not obvious how to use the 'ticks' argument here. - * - * 'Count' might be used as an indicator of how to set - * rsp_event when posting receive buffers to the rx_ring. It - * would replace the code at the tail of xnf_process_recv() - * that simply indicates that the next completed packet should - * cause an interrupt. - */ -} - -static void -xnf_resources(void *arg) -{ - xnf_t *xnfp = arg; - mac_rx_fifo_t mrf; - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = xnf_blank; - mrf.mrf_arg = (void *)xnfp; - mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnf_blank() */ - mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnf_blank() */ - - xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh, - (mac_resource_t *)&mrf); -} - -/*ARGSUSED*/ -static void xnf_ioctl(void *arg, queue_t *q, mblk_t *mp) { miocnak(q, mp, 0, EINVAL); @@ -2588,11 +2552,6 @@ xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) *capab = 0; break; } - - case MAC_CAPAB_POLL: - /* Just return B_TRUE. 
*/ - break; - default: return (B_FALSE); } diff --git a/usr/src/uts/common/xen/io/xnf.h b/usr/src/uts/common/xen/io/xnf.h index d8edf89f86..9b0cc4c357 100644 --- a/usr/src/uts/common/xen/io/xnf.h +++ b/usr/src/uts/common/xen/io/xnf.h @@ -135,7 +135,6 @@ typedef struct xnf { struct tx_pktinfo xnf_tx_pkt_info[NET_TX_RING_SIZE]; struct xnf_buffer_desc *xnf_rxpkt_bufptr[XNF_MAX_RXDESCS]; - mac_resource_handle_t xnf_rx_handle; ddi_iblock_cookie_t xnf_icookie; kmutex_t xnf_tx_buf_mutex; kmutex_t xnf_rx_buf_mutex; diff --git a/usr/src/uts/i86xpv/xnb/Makefile b/usr/src/uts/i86xpv/xnb/Makefile index 4fa08e3f70..dc7503a46e 100644 --- a/usr/src/uts/i86xpv/xnb/Makefile +++ b/usr/src/uts/i86xpv/xnb/Makefile @@ -20,10 +20,9 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the xnb # network driver support module. @@ -59,7 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # # Module depends on VNIC. # -LDFLAGS += -dy -N drv/vnic +LDFLAGS += -dy -N drv/vnic -N misc/mac # # use Solaris specific code in xen public header files diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index ee03e0967f..e29afc6c29 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -1245,6 +1245,8 @@ fcnname/**/_info: \ STUB(dld, dld_init_ops, nomod_void); STUB(dld, dld_fini_ops, nomod_void); STUB(dld, dld_autopush, nomod_minus_one); + STUB(dld, dld_ioc_register, nomod_einval); + STUB(dld, dld_ioc_unregister, nomod_void); END_MODULE(dld); #endif @@ -1255,12 +1257,15 @@ fcnname/**/_info: \ */ #ifndef DLS_MODULE MODULE(dls,misc); - STUB(dls, dls_devnet_vid, nomod_zero); STUB(dls, dls_devnet_mac, nomod_zero); STUB(dls, dls_devnet_hold_tmp, nomod_einval); STUB(dls, dls_devnet_rele_tmp, nomod_void); + STUB(dls, dls_devnet_hold_link, nomod_einval); + STUB(dls, dls_devnet_rele_link, nomod_void); STUB(dls, dls_devnet_prop_task_wait, nomod_void); STUB(dls, dls_mgmt_get_linkid, nomod_einval); + STUB(dls, dls_devnet_macname2linkid, nomod_einval); + STUB(dls, dls_mgmt_get_linkinfo, nomod_einval); END_MODULE(dls); #endif diff --git a/usr/src/uts/intel/io/amd8111s/amd8111s_main.c b/usr/src/uts/intel/io/amd8111s/amd8111s_main.c index 6587531959..1664ee7543 100644 --- a/usr/src/uts/intel/io/amd8111s/amd8111s_main.c +++ b/usr/src/uts/intel/io/amd8111s/amd8111s_main.c @@ -76,7 +76,6 @@ static int amd8111s_detach(dev_info_t *, ddi_detach_cmd_t); static int amd8111s_m_unicst(void *, const uint8_t *); static int amd8111s_m_promisc(void *, boolean_t); static int amd8111s_m_stat(void *, uint_t, uint64_t *); -static void amd8111s_m_resources(void *arg); static void amd8111s_m_ioctl(void *, queue_t *, mblk_t *); static int amd8111s_m_multicst(void *, boolean_t, const uint8_t *addr); static int amd8111s_m_start(void *); @@ -186,11 +185,9 @@ static ddi_device_acc_attr_t pcn_acc_attr = { DDI_STRICTORDER_ACC }; -#define AMD8111S_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL) - static mac_callbacks_t amd8111s_m_callbacks = { - AMD8111S_M_CALLBACK_FLAGS, + MC_IOCTL, amd8111s_m_stat, amd8111s_m_start, amd8111s_m_stop, @@ -198,7 +195,6 @@ static mac_callbacks_t amd8111s_m_callbacks = { amd8111s_m_multicst, amd8111s_m_unicst, amd8111s_m_tx, - amd8111s_m_resources, amd8111s_m_ioctl }; @@ -248,29 +244,6 @@ _fini() return (status); } -/* Adjust Interrupt Coalescing Register to coalesce interrupts */ 
-static void -amd8111s_m_blank(void *arg, time_t ticks, uint32_t count) -{ - _NOTE(ARGUNUSED(arg, ticks, count)); -} - -static void -amd8111s_m_resources(void *arg) -{ - struct LayerPointers *adapter = arg; - mac_rx_fifo_t mrf; - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = amd8111s_m_blank; - mrf.mrf_arg = (void *)adapter; - mrf.mrf_normal_blank_time = 128; - mrf.mrf_normal_pkt_count = 8; - - adapter->pOdl->mrh = mac_resource_add(adapter->pOdl->mh, - (mac_resource_t *)&mrf); -} - /* * Loopback Support */ @@ -665,7 +638,7 @@ amd8111s_receive(struct LayerPointers *pLayerPointers) } if (ret_mp) { - mac_rx(pOdl->mh, pOdl->mrh, ret_mp); + mac_rx(pOdl->mh, NULL, ret_mp); } (void) ddi_dma_sync(pOdl->rx_desc_dma_handle, 0, 0, diff --git a/usr/src/uts/intel/io/amd8111s/amd8111s_main.h b/usr/src/uts/intel/io/amd8111s/amd8111s_main.h index 922f5150c1..00f430273f 100755..100644 --- a/usr/src/uts/intel/io/amd8111s/amd8111s_main.h +++ b/usr/src/uts/intel/io/amd8111s/amd8111s_main.h @@ -1,13 +1,11 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef AMD8111S_MAIN_H #define AMD8111S_MAIN_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Copyright (c) 2001-2006 Advanced Micro Devices, Inc. All rights reserved. * @@ -55,10 +53,6 @@ * nationals of countries subject to national security controls. */ - -#pragma ident "@(#)$RCSfile: odl.h,v $ $Revision: 1.1 $ " \ -"$Date: 2004/04/22 15:22:52 $ AMD" - #include <sys/types.h> #include <sys/errno.h> #include <sys/kmem.h> @@ -79,7 +73,7 @@ #include <sys/ethernet.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/netlb.h> #include "amd8111s_hw.h" @@ -278,7 +272,6 @@ struct odl { dev_info_t *devinfo; mac_handle_t mh; /* mac module handle */ - mac_resource_handle_t mrh; struct amd8111s_statistics statistics; diff --git a/usr/src/uts/intel/ip/Makefile b/usr/src/uts/intel/ip/Makefile index c2e44f9934..6cd3d4ac5a 100644 --- a/usr/src/uts/intel/ip/Makefile +++ b/usr/src/uts/intel/ip/Makefile @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the ip driver # kernel module. 
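The amd8111s conversion above repeats the pattern applied to every NIC driver in this changeset: the MC_RESOURCES callback, mac_resource_add() and the mac_rx_fifo_t blanking hooks are deleted, and the driver hands received chains straight to mac_rx() with a NULL resource handle, leaving interrupt blanking, polling and soft-ring fanout to the MAC layer. A minimal sketch of a converted receive path follows; the foo_* names are illustrative stand-ins, not code from this changeset.

	#include <sys/types.h>
	#include <sys/ksynch.h>
	#include <sys/stream.h>
	#include <sys/ddi.h>
	#include <sys/sunddi.h>
	#include <sys/mac_provider.h>

	typedef struct foo {
		kmutex_t	foo_rx_lock;	/* protects the rx descriptor ring */
		mac_handle_t	foo_mh;		/* handle from mac_register() */
	} foo_t;

	/* Stand-in for the driver's descriptor-ring walk. */
	extern mblk_t *foo_drain_ring(foo_t *);

	static uint_t
	foo_intr(caddr_t arg)
	{
		foo_t	*foop = (foo_t *)arg;
		mblk_t	*mp;

		mutex_enter(&foop->foo_rx_lock);
		mp = foo_drain_ring(foop);	/* build an mblk chain */
		mutex_exit(&foop->foo_rx_lock);

		/*
		 * The second argument is now always NULL: with
		 * MAC_RX_FIFO and mac_resource_add() gone, the MAC
		 * layer owns blanking, polling and fanout.
		 */
		if (mp != NULL)
			mac_rx(foop->foo_mh, NULL, mp);

		return (DDI_INTR_CLAIMED);
	}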
diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 5854497325..f4bcb8ab0c 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -221,10 +216,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -262,19 +253,12 @@ sin_null skip_sctp_cksum sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_kstat -squeue_kstat_lock -squeue_profile -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -307,10 +291,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -318,6 +300,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index 065904b585..3866432363 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -217,10 +212,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -254,16 +245,12 @@ sin6_null sin_null sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -296,10 +283,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -307,6 
+292,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/intel/mac/Makefile b/usr/src/uts/intel/mac/Makefile index 12bd648ee0..870b260f75 100644 --- a/usr/src/uts/intel/mac/Makefile +++ b/usr/src/uts/intel/mac/Makefile @@ -22,13 +22,10 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # # This makefile drives the production of the mac driver # kernel module. # - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -53,7 +50,6 @@ include $(UTSBASE)/intel/Makefile.intel ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) -LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN # # Overrides. @@ -61,6 +57,9 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN CFLAGS += $(CCVERBOSE) LDFLAGS += -dy +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + # # Default build targets. # diff --git a/usr/src/uts/intel/vnic/Makefile b/usr/src/uts/intel/vnic/Makefile index 748d61a8b0..83a4c749c2 100644 --- a/usr/src/uts/intel/vnic/Makefile +++ b/usr/src/uts/intel/vnic/Makefile @@ -22,9 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -55,7 +52,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # Overrides # CFLAGS += $(CCVERBOSE) -LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Ndrv/ip -Nmisc/dls +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls # # Default build targets. diff --git a/usr/src/uts/intel/xge/Makefile b/usr/src/uts/intel/xge/Makefile index 6689f7a758..8541c1b052 100644 --- a/usr/src/uts/intel/xge/Makefile +++ b/usr/src/uts/intel/xge/Makefile @@ -20,11 +20,9 @@ # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the Neterion Xframe # 10G Ethernet (XGE) driver module in x86 systems # diff --git a/usr/src/uts/sparc/ip/Makefile b/usr/src/uts/sparc/ip/Makefile index c330f273f9..515f079865 100644 --- a/usr/src/uts/sparc/ip/Makefile +++ b/usr/src/uts/sparc/ip/Makefile @@ -19,17 +19,15 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the ip driver # kernel module. # # sparc architecture dependent # - # # Path to the base of the uts directory tree (usually /usr/src/uts). 
# diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 5854497325..f4bcb8ab0c 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -221,10 +216,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -262,19 +253,12 @@ sin_null skip_sctp_cksum sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_kstat -squeue_kstat_lock -squeue_profile -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -307,10 +291,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -318,6 +300,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 065904b585..3866432363 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -217,10 +212,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -254,16 +245,12 @@ sin6_null sin_null sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -296,10 +283,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -307,6 
+292,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/sparc/mac/Makefile b/usr/src/uts/sparc/mac/Makefile index d343e0bc74..5ef314a2ef 100644 --- a/usr/src/uts/sparc/mac/Makefile +++ b/usr/src/uts/sparc/mac/Makefile @@ -22,14 +22,12 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the mac driver # kernel module. # # sparc architecture dependent # - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -54,7 +52,6 @@ include $(UTSBASE)/sparc/Makefile.sparc ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) -LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN # # Overrides. @@ -64,6 +61,9 @@ $(RELEASE_BUILD)CFLAGS += -xinline=auto -xcrossfile $(RELEASE_BUILD)COPTIMIZE = -xO5 LDFLAGS += -dy +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + # # Default build targets. # diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index e45cd91325..e315c9857c 100644 --- a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -1199,6 +1199,8 @@ stubs_base: MODULE(dld,drv); STUB(dld, dld_init_ops, nomod_void); STUB(dld, dld_fini_ops, nomod_void); + STUB(dld, dld_ioc_register, nomod_einval); + STUB(dld, dld_ioc_unregister, nomod_void); STUB(dld, dld_autopush, nomod_minus_one); END_MODULE(dld); #endif @@ -1210,12 +1212,15 @@ stubs_base: */ #ifndef DLS_MODULE MODULE(dls,misc); - STUB(dls, dls_devnet_vid, nomod_zero); STUB(dls, dls_devnet_mac, nomod_zero); STUB(dls, dls_devnet_hold_tmp, nomod_einval); STUB(dls, dls_devnet_rele_tmp, nomod_void); + STUB(dls, dls_devnet_hold_link, nomod_einval); + STUB(dls, dls_devnet_rele_link, nomod_void); STUB(dls, dls_devnet_prop_task_wait, nomod_void); STUB(dls, dls_mgmt_get_linkid, nomod_einval); + STUB(dls, dls_devnet_macname2linkid, nomod_einval); + STUB(dls, dls_mgmt_get_linkinfo, nomod_einval); END_MODULE(dls); #endif diff --git a/usr/src/uts/sparc/vnic/Makefile b/usr/src/uts/sparc/vnic/Makefile index f3389cb97a..41052c901d 100644 --- a/usr/src/uts/sparc/vnic/Makefile +++ b/usr/src/uts/sparc/vnic/Makefile @@ -22,9 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -55,7 +52,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # Overrides # CFLAGS += $(CCVERBOSE) -LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Ndrv/ip -Nmisc/dls +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls # # Default build targets. diff --git a/usr/src/uts/sparc/xge/Makefile b/usr/src/uts/sparc/xge/Makefile index 2d66030c07..f30c4612e3 100644 --- a/usr/src/uts/sparc/xge/Makefile +++ b/usr/src/uts/sparc/xge/Makefile @@ -20,11 +20,9 @@ # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the Neterion Xframe # 10G Ethernet (XGE) driver module in x86 systems # diff --git a/usr/src/uts/sun/io/eri/eri.c b/usr/src/uts/sun/io/eri/eri.c index 0fac98abf1..7635d9553e 100644 --- a/usr/src/uts/sun/io/eri/eri.c +++ b/usr/src/uts/sun/io/eri/eri.c @@ -47,7 +47,7 @@ #include <sys/ethernet.h> #include <sys/vlan.h> #include <sys/policy.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/dlpi.h> @@ -200,7 +200,6 @@ static mac_callbacks_t eri_m_callbacks = { eri_m_multicst, eri_m_unicst, eri_m_tx, - NULL, eri_m_ioctl, eri_m_getcapab }; @@ -1293,7 +1292,6 @@ eri_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *hcksum_txflags = HCKSUM_INET_PARTIAL; return (B_TRUE); } - case MAC_CAPAB_POLL: default: return (B_FALSE); } } diff --git a/usr/src/uts/sun/io/hme.c b/usr/src/uts/sun/io/hme.c index 399d995b10..0423d1d736 100644 --- a/usr/src/uts/sun/io/hme.c +++ b/usr/src/uts/sun/io/hme.c @@ -44,7 +44,7 @@ #include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/strsubr.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ethernet.h> #include <sys/vlan.h> @@ -487,7 +487,6 @@ static mac_callbacks_t hme_m_callbacks = { hme_m_multicst, hme_m_unicst, hme_m_tx, - NULL, hme_m_ioctl, hme_m_getcapab, }; diff --git a/usr/src/uts/sun/io/qfe.c b/usr/src/uts/sun/io/qfe.c index 4a98701b87..ad9bfe8fee 100644 --- a/usr/src/uts/sun/io/qfe.c +++ b/usr/src/uts/sun/io/qfe.c @@ -36,7 +36,7 @@ #include <sys/kmem.h> #include <sys/modctl.h> #include <sys/conf.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ddi.h> #include <sys/sunddi.h> diff --git a/usr/src/uts/sun4u/io/rmclomv.c b/usr/src/uts/sun4u/io/rmclomv.c index 2afee7d1dd..93e236b121 100644 --- a/usr/src/uts/sun4u/io/rmclomv.c +++ b/usr/src/uts/sun4u/io/rmclomv.c @@ -61,7 +61,6 @@ #define CPU_SIGNATURE_DELAY_TIME 5000000 /* 5 secs, in microsecs */ extern void pmugpio_watchdog_pat(); -static clock_t timesync_interval; extern int watchdog_activated; static int last_watchdog_msg = 1; @@ -118,6 +117,10 @@ static uint_t rmc_clear_watchdog_timer(void); static void send_watchdog_msg(int msg); static void plat_timesync(void *arg); +static kmutex_t timesync_lock; +static clock_t timesync_interval = 0; +static timeout_id_t timesync_tid = 0; + /* * Driver entry points */ @@ -310,6 +313,7 @@ _init(void) mutex_init(&rmclomv_refresh_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&rmclomv_cache_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&rmclomv_state_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&timesync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rmclomv_checkrmc_sig_cv, NULL, CV_DRIVER, NULL); cv_init(&rmclomv_refresh_sig_cv, NULL, CV_DRIVER, NULL); @@ -344,6 +348,7 @@ _fini(void) return (error); cv_destroy(&rmclomv_refresh_sig_cv); cv_destroy(&rmclomv_checkrmc_sig_cv); + mutex_destroy(&timesync_lock); mutex_destroy(&rmclomv_state_lock); mutex_destroy(&rmclomv_cache_lock); mutex_destroy(&rmclomv_refresh_lock); @@ -479,8 +484,9 @@ rmclomv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) static int rmclomv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - int instance; - int err; + timeout_id_t tid; + int instance; + int err; switch (cmd) { case DDI_DETACH: @@ -502,6 +508,13 @@ rmclomv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) rmclomv_reset_cache(NULL, NULL, NULL); ddi_remove_minor_node(dip, NULL); + mutex_enter(&timesync_lock); + tid = timesync_tid; + timesync_tid = 0; + 
timesync_interval = 0; + mutex_exit(&timesync_lock); + (void) untimeout(tid); + /* Forget the dev info */ rmclomv_dip = NULL; rmc_comm_unregister(); @@ -3419,7 +3432,10 @@ plat_timesync(void *arg) (void) rmc_comm_request_nowait(&request, 0); - (void) timeout(plat_timesync, NULL, timesync_interval); + mutex_enter(&timesync_lock); + if (timesync_interval != 0) + timesync_tid = timeout(plat_timesync, NULL, timesync_interval); + mutex_exit(&timesync_lock); } /* diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c index 191cfba92b..64f3c278f5 100644 --- a/usr/src/uts/sun4v/io/vnet.c +++ b/usr/src/uts/sun4v/io/vnet.c @@ -39,7 +39,7 @@ #include <sys/ethernet.h> #include <sys/dlpi.h> #include <net/if.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ddi.h> #include <sys/sunddi.h> diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c index 2a273019b8..b6671a36ad 100644 --- a/usr/src/uts/sun4v/io/vnet_gen.c +++ b/usr/src/uts/sun4v/io/vnet_gen.c @@ -42,7 +42,7 @@ #include <sys/sunddi.h> #include <sys/strsun.h> #include <sys/note.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ldc.h> #include <sys/mach_descrip.h> diff --git a/usr/src/uts/sun4v/io/vsw.c b/usr/src/uts/sun4v/io/vsw.c index 27ad33ff66..fc3fdceeeb 100644 --- a/usr/src/uts/sun4v/io/vsw.c +++ b/usr/src/uts/sun4v/io/vsw.c @@ -53,12 +53,12 @@ #include <sys/machsystm.h> #include <sys/modctl.h> #include <sys/modhash.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/taskq.h> #include <sys/note.h> #include <sys/mach_descrip.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mdeg.h> #include <sys/ldc.h> #include <sys/vsw_fdb.h> @@ -78,7 +78,7 @@ static int vsw_attach(dev_info_t *, ddi_attach_cmd_t); static int vsw_detach(dev_info_t *, ddi_detach_cmd_t); static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *); -static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *); +static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *); /* MDEG routines */ static int vsw_mdeg_register(vsw_t *vswp); @@ -88,7 +88,7 @@ static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *); static int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t); static int vsw_read_mdprops(vsw_t *vswp); static void vsw_vlan_read_ids(void *arg, int type, md_t *mdp, - mde_cookie_t node, uint16_t *pvidp, uint16_t **vidspp, + mde_cookie_t node, uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp, uint16_t *default_idp); static int vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp, md_t *mdp, mde_cookie_t *node); @@ -99,6 +99,8 @@ static void vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node, static int vsw_mtu_update(vsw_t *vswp, uint32_t mtu); static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t); static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr); +static boolean_t vsw_cmp_vids(vsw_vlanid_t *vids1, + vsw_vlanid_t *vids2, int nvids); /* Mac driver related routines */ static int vsw_mac_register(vsw_t *); @@ -132,13 +134,9 @@ static int vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, md_t *prev_mdp, mde_cookie_t prev_mdex); extern int vsw_port_attach(vsw_port_t *port); extern vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); -extern int vsw_mac_attach(vsw_t *vswp); -extern void vsw_mac_detach(vsw_t *vswp); extern int vsw_mac_open(vsw_t *vswp); extern void vsw_mac_close(vsw_t *vswp); 
-extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); -extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int); -extern void vsw_reconfig_hw(vsw_t *); +extern void vsw_mac_cleanup_ports(vsw_t *vswp); extern void vsw_unset_addrs(vsw_t *vswp); extern void vsw_setup_layer2_post_process(vsw_t *vswp); extern void vsw_create_vlans(void *arg, int type); @@ -150,6 +148,16 @@ extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt); extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp); extern void vsw_hio_cleanup(vsw_t *vswp); +extern void vsw_hio_start_ports(vsw_t *vswp); +extern void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled); +extern int vsw_mac_multicast_add(vsw_t *, vsw_port_t *, mcst_addr_t *, int); +extern void vsw_mac_multicast_remove(vsw_t *, vsw_port_t *, mcst_addr_t *, int); +extern void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid, + vsw_vlanid_t *new_vids, int new_nvids); +extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); +extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); +extern void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); extern void vsw_reset_ports(vsw_t *vswp); extern void vsw_port_reset(vsw_port_t *portp); void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled); @@ -223,16 +231,6 @@ boolean_t vsw_hio_enabled = B_TRUE; /* Enable/disable HybridIO */ int vsw_hio_max_cleanup_retries = 10; /* Max retries for HybridIO cleanp */ int vsw_hio_cleanup_delay = 10000; /* 10ms */ -/* - * External tunables. - */ -/* - * Enable/disable thread per ring. This is a mode selection - * that is done a vsw driver attach time. - */ -boolean_t vsw_multi_ring_enable = B_FALSE; -int vsw_mac_rx_rings = VSW_MAC_RX_RINGS; - /* Number of transmit descriptors - must be power of 2 */ uint32_t vsw_ntxds = VSW_RING_NUM_EL; @@ -543,11 +541,11 @@ vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) vswp->instance = instance; ddi_set_driver_private(dip, (caddr_t)vswp); - mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&vswp->swtmout_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&vswp->maccl_rwlock, NULL, RW_DRIVER, NULL); rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL); - rw_init(&vswp->mac_rwlock, NULL, RW_DRIVER, NULL); rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL); rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL); @@ -669,10 +667,9 @@ vsw_attach_fail: if (progress & PROG_swmode) { vsw_stop_switching_timeout(vswp); vsw_hio_cleanup(vswp); - WRITE_ENTER(&vswp->mac_rwlock); - vsw_mac_detach(vswp); + mutex_enter(&vswp->mac_lock); vsw_mac_close(vswp); - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); } if (progress & PROG_taskq) @@ -697,11 +694,11 @@ vsw_attach_fail: if (progress & PROG_locks) { rw_destroy(&vswp->plist.lockrw); rw_destroy(&vswp->mfdbrw); - rw_destroy(&vswp->mac_rwlock); rw_destroy(&vswp->if_lockrw); + rw_destroy(&vswp->maccl_rwlock); mutex_destroy(&vswp->swtmout_lock); mutex_destroy(&vswp->mca_lock); - mutex_destroy(&vswp->hw_lock); + mutex_destroy(&vswp->mac_lock); } ddi_soft_state_free(vsw_state, instance); @@ -736,6 +733,9 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) /* Stop any pending timeout to setup switching mode. 
*/ vsw_stop_switching_timeout(vswp); + /* Cleanup the interface's mac client */ + vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV); + if (vswp->if_state & VSW_IF_REG) { if (vsw_mac_unregister(vswp) != 0) { cmn_err(CE_WARN, "!vsw%d: Unable to detach from " @@ -746,13 +746,8 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) vsw_mdeg_unregister(vswp); - /* remove mac layer callback */ - WRITE_ENTER(&vswp->mac_rwlock); - if ((vswp->mh != NULL) && (vswp->mrh != NULL)) { - mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE); - vswp->mrh = NULL; - } - RW_EXIT(&vswp->mac_rwlock); + /* cleanup HybridIO */ + vsw_hio_cleanup(vswp); if (vsw_detach_ports(vswp) != 0) { cmn_err(CE_WARN, "!vsw%d: Unable to unconfigure ports", @@ -762,24 +757,19 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) rw_destroy(&vswp->if_lockrw); - /* cleanup HybridIO */ - vsw_hio_cleanup(vswp); - - mutex_destroy(&vswp->hw_lock); + vsw_mac_cleanup_ports(vswp); /* * Now that the ports have been deleted, stop and close * the physical device. */ - WRITE_ENTER(&vswp->mac_rwlock); - - vsw_mac_detach(vswp); + mutex_enter(&vswp->mac_lock); vsw_mac_close(vswp); + mutex_exit(&vswp->mac_lock); - RW_EXIT(&vswp->mac_rwlock); - - rw_destroy(&vswp->mac_rwlock); + mutex_destroy(&vswp->mac_lock); mutex_destroy(&vswp->swtmout_lock); + rw_destroy(&vswp->maccl_rwlock); /* * Destroy any free pools that may still exist. @@ -936,15 +926,12 @@ vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name) /* * Read the 'vsw-switch-mode' property from the specified MD node. * - * Returns 0 on success and the number of modes found in 'found', - * otherwise returns 1. + * Returns 0 on success, otherwise returns 1. */ static int -vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, - uint8_t *modes, int *found) +vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint8_t *mode) { int len = 0; - int smode_num = 0; char *smode = NULL; char *curr_mode = NULL; @@ -956,7 +943,6 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, * first item in list. */ len = 0; - smode_num = 0; if (md_get_prop_data(mdp, node, smode_propname, (uint8_t **)(&smode), &len) != 0) { /* @@ -965,7 +951,6 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, */ cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property" " from the MD", vswp->instance); - *found = 0; return (1); } @@ -979,25 +964,24 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, * 'routed' - layer 3 (i.e. IP) routing, underlying HW * in non-promiscuous mode. 
*/ - while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) { + while (curr_mode < (smode + len)) { D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode); if (strcmp(curr_mode, "switched") == 0) { - modes[smode_num++] = VSW_LAYER2; + *mode = VSW_LAYER2; } else if (strcmp(curr_mode, "promiscuous") == 0) { - modes[smode_num++] = VSW_LAYER2_PROMISC; + *mode = VSW_LAYER2 | VSW_LAYER2_PROMISC; } else if (strcmp(curr_mode, "routed") == 0) { - modes[smode_num++] = VSW_LAYER3; + *mode = VSW_LAYER3; } else { - DWARN(vswp, "%s: Unknown switch mode %s, " - "setting to default 'switched' mode", - __func__, curr_mode); - modes[smode_num++] = VSW_LAYER2; + cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, " + "setting to default switched mode", + vswp->instance, curr_mode); + *mode = VSW_LAYER2; } curr_mode += strlen(curr_mode) + 1; } - *found = smode_num; - D2(vswp, "%s: %d modes found", __func__, smode_num); + D2(vswp, "%s: %d mode", __func__, *mode); D1(vswp, "%s: exit", __func__); @@ -1082,16 +1066,16 @@ vsw_m_stat(void *arg, uint_t stat, uint64_t *val) D1(vswp, "%s: enter", __func__); - WRITE_ENTER(&vswp->mac_rwlock); + mutex_enter(&vswp->mac_lock); if (vswp->mh == NULL) { - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (EINVAL); } /* return stats from underlying device */ *val = mac_stat_get(vswp->mh, stat); - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (0); } @@ -1107,14 +1091,8 @@ vsw_m_stop(void *arg) vswp->if_state &= ~VSW_IF_UP; RW_EXIT(&vswp->if_lockrw); - mutex_enter(&vswp->hw_lock); - - (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); - - if (vswp->recfg_reqd) - vsw_reconfig_hw(vswp); - - mutex_exit(&vswp->hw_lock); + /* Cleanup and close the mac client */ + vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV); D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); } @@ -1122,6 +1100,7 @@ vsw_m_stop(void *arg) static int vsw_m_start(void *arg) { + int rv; vsw_t *vswp = (vsw_t *)arg; D1(vswp, "%s: enter", __func__); @@ -1143,9 +1122,13 @@ vsw_m_start(void *arg) /* if in layer2 mode, program unicast address. */ if (vswp->mh != NULL) { - mutex_enter(&vswp->hw_lock); - (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); + /* Init a mac client and program addresses */ + rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV); + if (rv != 0) { + cmn_err(CE_NOTE, + "!vsw%d: failed to program interface " + "unicast address\n", vswp->instance); + } } RW_EXIT(&vswp->if_lockrw); @@ -1211,29 +1194,21 @@ vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) * Call into the underlying driver to program the * address into HW. 
*/ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL) { - ret = mac_multicst_add(vswp->mh, mca); - if (ret != 0) { - cmn_err(CE_NOTE, "!vsw%d: unable to " - "add multicast address", - vswp->instance); - RW_EXIT(&vswp->mac_rwlock); - (void) vsw_del_mcst(vswp, - VSW_LOCALDEV, addr, NULL); - kmem_free(mcst_p, sizeof (*mcst_p)); - return (ret); - } - mcst_p->mac_added = B_TRUE; + ret = vsw_mac_multicast_add(vswp, NULL, mcst_p, + VSW_LOCALDEV); + if (ret != 0) { + (void) vsw_del_mcst(vswp, + VSW_LOCALDEV, addr, NULL); + kmem_free(mcst_p, sizeof (*mcst_p)); + return (ret); } - RW_EXIT(&vswp->mac_rwlock); mutex_enter(&vswp->mca_lock); mcst_p->nextp = vswp->mcap; vswp->mcap = mcst_p; mutex_exit(&vswp->mca_lock); } else { - cmn_err(CE_NOTE, "!vsw%d: unable to add multicast " + cmn_err(CE_WARN, "!vsw%d: unable to add multicast " "address", vswp->instance); } return (ret); @@ -1252,12 +1227,7 @@ vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr); ASSERT(mcst_p != NULL); - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL && mcst_p->mac_added) { - (void) mac_multicst_remove(vswp->mh, mca); - mcst_p->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); + vsw_mac_multicast_remove(vswp, NULL, mcst_p, VSW_LOCALDEV); kmem_free(mcst_p, sizeof (*mcst_p)); } @@ -1685,8 +1655,7 @@ vsw_readmd_exit: static int vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) { - int i; - uint64_t macaddr = 0; + uint64_t macaddr = 0; D1(vswp, "%s: enter", __func__); @@ -1703,17 +1672,12 @@ vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) vsw_save_lmacaddr(vswp, macaddr); - if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) { + if (vsw_get_md_smodes(vswp, mdp, node, &vswp->smode)) { DWARN(vswp, "%s: Unable to read %s property from MD, " "defaulting to 'switched' mode", __func__, smode_propname); - for (i = 0; i < NUM_SMODES; i++) - vswp->smode[i] = VSW_LAYER2; - - vswp->smode_num = NUM_SMODES; - } else { - ASSERT(vswp->smode_num != 0); + vswp->smode = VSW_LAYER2; } /* read mtu */ @@ -1751,7 +1715,7 @@ vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) */ static void vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node, - uint16_t *pvidp, uint16_t **vidspp, uint16_t *nvidsp, + uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp, uint16_t *default_idp) { vsw_t *vswp; @@ -1823,11 +1787,12 @@ vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node, if (nvids != 0) { D2(vswp, "%s: %s(%d): ", __func__, vid_propname, inst); - vids_size = sizeof (uint16_t) * nvids; + vids_size = sizeof (vsw_vlanid_t) * nvids; *vidspp = kmem_zalloc(vids_size, KM_SLEEP); for (i = 0; i < nvids; i++) { - (*vidspp)[i] = data[i] & 0xFFFF; - D2(vswp, " %d ", (*vidspp)[i]); + (*vidspp)[i].vl_vid = data[i] & 0xFFFF; + (*vidspp)[i].vl_set = B_FALSE; + D2(vswp, " %d ", (*vidspp)[i].vl_vid); } D2(vswp, "\n"); } @@ -1959,35 +1924,6 @@ vsw_mtu_update(vsw_t *vswp, uint32_t mtu) RW_EXIT(&vswp->if_lockrw); - WRITE_ENTER(&vswp->mac_rwlock); - - if (vswp->mh == 0) { - /* - * Physical device is not available yet; mtu will be - * updated after we open it successfully, as we have - * saved the new mtu. - */ - D2(vswp, "%s: Physical device:%s is not " - "available yet; can't update its mtu\n", - __func__, vswp->physname); - - } else { - - /* - * Stop and restart to enable the - * new mtu in the physical device. 
- */ - vsw_mac_detach(vswp); - rv = vsw_mac_attach(vswp); - if (rv != 0) { - RW_EXIT(&vswp->mac_rwlock); - return (EIO); - } - - } - - RW_EXIT(&vswp->mac_rwlock); - /* Reset ports to renegotiate with the new mtu */ vsw_reset_ports(vswp); @@ -2014,8 +1950,8 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) char physname[LIFNAMSIZ]; char drv[LIFNAMSIZ]; uint_t ddi_instance; - uint8_t new_smode[NUM_SMODES]; - int i, smode_num = 0; + uint8_t new_smode; + int i; uint64_t macaddr = 0; enum {MD_init = 0x1, MD_physname = 0x2, @@ -2025,7 +1961,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) MD_mtu = 0x20} updated; int rv; uint16_t pvid; - uint16_t *vids; + vsw_vlanid_t *vids; uint16_t nvids; uint32_t mtu; @@ -2099,25 +2035,16 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) /* * Check if switching modes have changed. */ - if (vsw_get_md_smodes(vswp, mdp, node, - new_smode, &smode_num)) { + if (vsw_get_md_smodes(vswp, mdp, node, &new_smode)) { cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", vswp->instance, smode_propname); goto fail_reconf; } else { - ASSERT(smode_num != 0); - if (smode_num != vswp->smode_num) { - D2(vswp, "%s: number of modes changed from %d to %d", - __func__, vswp->smode_num, smode_num); - } + if (new_smode != vswp->smode) { + D2(vswp, "%s: switching mode changed from %d to %d", + __func__, vswp->smode, new_smode); - for (i = 0; i < smode_num; i++) { - if (new_smode[i] != vswp->smode[i]) { - D2(vswp, "%s: mode changed from %d to %d", - __func__, vswp->smode[i], new_smode[i]); - updated |= MD_smode; - break; - } + updated |= MD_smode; } } @@ -2129,7 +2056,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) if ((pvid != vswp->pvid) || /* pvid changed? */ (nvids != vswp->nvids) || /* # of vids changed? */ ((nvids != 0) && (vswp->nvids != 0) && /* vids changed? */ - bcmp(vids, vswp->vids, sizeof (uint16_t) * nvids))) { + !vsw_cmp_vids(vids, vswp->vids, nvids))) { updated |= MD_vlans; } @@ -2149,7 +2076,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) * Now make any changes which are needed... */ - if (updated & (MD_physname | MD_smode)) { + if (updated & (MD_physname | MD_smode | MD_mtu)) { /* * Stop any pending timeout to setup switching mode. @@ -2161,19 +2088,17 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) /* * Remove unicst, mcst addrs of vsw interface - * and ports from the physdev. + * and ports from the physdev. This also closes + * the corresponding mac clients. */ vsw_unset_addrs(vswp); /* * Stop, detach and close the old device.. */ - WRITE_ENTER(&vswp->mac_rwlock); - - vsw_mac_detach(vswp); + mutex_enter(&vswp->mac_lock); vsw_mac_close(vswp); - - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); /* * Update phys name. @@ -2189,11 +2114,15 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) * Update array with the new switch mode values. 
*/ if (updated & MD_smode) { - for (i = 0; i < smode_num; i++) - vswp->smode[i] = new_smode[i]; + vswp->smode = new_smode; + } - vswp->smode_num = smode_num; - vswp->smode_idx = 0; + /* Update mtu */ + if (updated & MD_mtu) { + rv = vsw_mtu_update(vswp, mtu); + if (rv != 0) { + goto fail_update; + } } /* @@ -2237,24 +2166,9 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) READ_ENTER(&vswp->if_lockrw); if (vswp->if_state & VSW_IF_UP) { + /* reconfigure with new address */ + vsw_if_mac_reconfig(vswp, B_FALSE, 0, NULL, 0); - mutex_enter(&vswp->hw_lock); - /* - * Remove old mac address of vsw interface - * from the physdev - */ - (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); - /* - * Program new mac address of vsw interface - * in the physdev - */ - rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); - if (rv != 0) { - cmn_err(CE_NOTE, - "!vsw%d: failed to program interface " - "unicast address\n", vswp->instance); - } /* * Notify the MAC layer of the changed address. */ @@ -2270,32 +2184,24 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) /* Remove existing vlan ids from the hash table. */ vsw_vlan_remove_ids(vswp, VSW_LOCALDEV); - /* save the new vlan ids */ - vswp->pvid = pvid; - if (vswp->nvids != 0) { - kmem_free(vswp->vids, sizeof (uint16_t) * vswp->nvids); - vswp->nvids = 0; - } - if (nvids != 0) { - vswp->nvids = nvids; + if (vswp->if_state & VSW_IF_UP) { + vsw_if_mac_reconfig(vswp, B_TRUE, pvid, vids, nvids); + } else { + if (vswp->nvids != 0) { + kmem_free(vswp->vids, + sizeof (vsw_vlanid_t) * vswp->nvids); + } vswp->vids = vids; + vswp->nvids = nvids; + vswp->pvid = pvid; } /* add these new vlan ids into hash table */ vsw_vlan_add_ids(vswp, VSW_LOCALDEV); } else { if (nvids != 0) { - kmem_free(vids, sizeof (uint16_t) * nvids); - } - } - - if (updated & MD_mtu) { - - rv = vsw_mtu_update(vswp, mtu); - if (rv != 0) { - goto fail_update; + kmem_free(vids, sizeof (vsw_vlanid_t) * nvids); } - } return; @@ -2397,7 +2303,7 @@ vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp, /* now update all properties into the port */ portp->p_vswp = vswp; portp->p_instance = inst; - portp->addr_set = VSW_ADDR_UNSET; + portp->addr_set = B_FALSE; ether_copy(&ea, &portp->p_macaddr); if (nchan > VSW_PORT_MAX_LDCS) { D2(vswp, "%s: using first of %d ldc ids", @@ -2466,7 +2372,7 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, vsw_port_t *portp; boolean_t updated_vlans = B_FALSE; uint16_t pvid; - uint16_t *vids; + vsw_vlanid_t *vids; uint16_t nvids; uint64_t val; boolean_t hio_enabled = B_FALSE; @@ -2503,7 +2409,7 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, if ((pvid != portp->pvid) || /* pvid changed? */ (nvids != portp->nvids) || /* # of vids changed? */ ((nvids != 0) && (portp->nvids != 0) && /* vids changed? */ - bcmp(vids, portp->vids, sizeof (uint16_t) * nvids))) { + !vsw_cmp_vids(vids, portp->vids, nvids))) { updated_vlans = B_TRUE; } @@ -2512,20 +2418,8 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, /* Remove existing vlan ids from the hash table. 
*/ vsw_vlan_remove_ids(portp, VSW_VNETPORT); - /* save the new vlan ids */ - portp->pvid = pvid; - if (portp->nvids != 0) { - kmem_free(portp->vids, - sizeof (uint16_t) * portp->nvids); - portp->nvids = 0; - } - if (nvids != 0) { - portp->vids = kmem_zalloc(sizeof (uint16_t) * - nvids, KM_SLEEP); - bcopy(vids, portp->vids, sizeof (uint16_t) * nvids); - portp->nvids = nvids; - kmem_free(vids, sizeof (uint16_t) * nvids); - } + /* Reconfigure vlans with network device */ + vsw_mac_port_reconfig_vlans(portp, pvid, vids, nvids); /* add these new vlan ids into hash table */ vsw_vlan_add_ids(portp, VSW_VNETPORT); @@ -2628,3 +2522,23 @@ vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr) } RW_EXIT(&vswp->if_lockrw); } + +/* Compare VLAN ids, array size expected to be same. */ +static boolean_t +vsw_cmp_vids(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids) +{ + int i, j; + uint16_t vid; + + for (i = 0; i < nvids; i++) { + vid = vids1[i].vl_vid; + for (j = 0; j < nvids; j++) { + if (vid == vids2[i].vl_vid) + break; + } + if (j == nvids) { + return (B_FALSE); + } + } + return (B_TRUE); +} diff --git a/usr/src/uts/sun4v/io/vsw_hio.c b/usr/src/uts/sun4v/io/vsw_hio.c index 278896d977..084c338548 100644 --- a/usr/src/uts/sun4v/io/vsw_hio.c +++ b/usr/src/uts/sun4v/io/vsw_hio.c @@ -53,7 +53,7 @@ #include <sys/machsystm.h> #include <sys/modctl.h> #include <sys/modhash.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/taskq.h> #include <sys/note.h> @@ -80,9 +80,9 @@ extern int vsw_hio_cleanup_delay; /* Functions imported from other files */ extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t); -extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); -extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int); extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate); +extern void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); /* Functions exported to other files */ void vsw_hio_init(vsw_t *vswp); @@ -104,11 +104,24 @@ static int vsw_send_dds_msg(vsw_ldc_t *ldcp, uint8_t dds_subclass, uint64_t cookie, uint64_t macaddr, uint32_t req_id); static int vsw_send_dds_resp_msg(vsw_ldc_t *ldcp, vio_dds_msg_t *dmsg, int ack); static int vsw_hio_send_delshare_msg(vsw_share_t *vsharep); -static int vsw_hio_bind_macaddr(vsw_share_t *vsharep); -static void vsw_hio_unbind_macaddr(vsw_share_t *vsharep); static boolean_t vsw_hio_reboot_callb(void *arg, int code); static boolean_t vsw_hio_panic_callb(void *arg, int code); +/* + * Locking strategy for HybridIO is followed as below: + * + * - As the Shares are associated with a network device, the + * the global lock('vswp>mac_lock') is used for all Shares + * related operations. + * - The 'port->maccl_rwlock' is used to synchronize only the + * the operations that operate on that port's mac client. That + * is, the share_bind and unbind operations only. 
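[Editor's note] One thing to watch in the new vsw_cmp_vids() above: the inner loop compares against vids2[i].vl_vid using the outer index, so each inner iteration re-tests the same element. The net effect is that the function returns B_TRUE only when the arrays match positionally — i.e. it degenerates to the bcmp()-style ordered check it replaces, and a reordered-but-identical set of ids is still flagged as a change (triggering an unnecessary VLAN reconfiguration, though nothing worse). A corrected user-level version of the intended order-insensitive comparison, keeping the same equal-length contract and O(nvids^2) shape:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

typedef struct { uint16_t vl_vid; bool vl_set; } vsw_vlanid_t;

/* order-insensitive: every id in vids1 must appear somewhere in vids2 */
static bool
vsw_cmp_vids_fixed(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids)
{
	for (int i = 0; i < nvids; i++) {
		int j;
		for (j = 0; j < nvids; j++)
			if (vids1[i].vl_vid == vids2[j].vl_vid)	/* j, not i */
				break;
		if (j == nvids)
			return (false);
	}
	return (true);
}

int
main(void)
{
	vsw_vlanid_t a[] = { {10, 0}, {20, 0} };
	vsw_vlanid_t b[] = { {20, 0}, {10, 0} };	/* same set, reordered */
	printf("%d\n", (int)vsw_cmp_vids_fixed(a, b, 2));	/* prints 1 */
	return (0);
}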
+ * + * - The locking hierarchy follows that the global mac_lock is + * acquired first and then the ports mac client lock(maccl_rwlock) + */ + + static kstat_t *vsw_hio_setup_kstats(char *ks_mod, char *ks_name, vsw_t *vswp); static void vsw_hio_destroy_kstats(vsw_t *vswp); static int vsw_hio_kstats_update(kstat_t *ksp, int rw); @@ -122,32 +135,23 @@ void vsw_hio_init(vsw_t *vswp) { vsw_hio_t *hiop = &vswp->vhio; + int num_shares; int i; - int rv; + ASSERT(MUTEX_HELD(&vswp->mac_lock)); D1(vswp, "%s:enter\n", __func__); - mutex_enter(&vswp->hw_lock); if (vsw_hio_enabled == B_FALSE) { - mutex_exit(&vswp->hw_lock); return; } vswp->hio_capable = B_FALSE; - rv = mac_capab_get(vswp->mh, MAC_CAPAB_SHARES, &hiop->vh_scapab); - if (rv == B_FALSE) { + num_shares = mac_share_capable(vswp->mh); + if (num_shares == 0) { D2(vswp, "%s: %s is not HybridIO capable\n", __func__, vswp->physname); - mutex_exit(&vswp->hw_lock); return; } - rv = mac_capab_get(vswp->mh, MAC_CAPAB_RINGS, &hiop->vh_rcapab); - if (rv == B_FALSE) { - DWARN(vswp, "%s: %s has no RINGS capability\n", __func__, - vswp->physname); - mutex_exit(&vswp->hw_lock); - return; - } - hiop->vh_num_shares = hiop->vh_scapab.ms_snum; + hiop->vh_num_shares = num_shares; hiop->vh_shares = kmem_zalloc((sizeof (vsw_share_t) * hiop->vh_num_shares), KM_SLEEP); for (i = 0; i < hiop->vh_num_shares; i++) { @@ -176,7 +180,6 @@ vsw_hio_init(vsw_t *vswp) D2(vswp, "%s: %s is HybridIO capable num_shares=%d\n", __func__, vswp->physname, hiop->vh_num_shares); D1(vswp, "%s:exit\n", __func__); - mutex_exit(&vswp->hw_lock); } /* @@ -187,13 +190,9 @@ vsw_hio_init(vsw_t *vswp) static vsw_share_t * vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp) { - vsw_hio_t *hiop = &vswp->vhio; - mac_capab_share_t *hcapab = &hiop->vh_scapab; vsw_share_t *vsharep; vsw_port_t *portp = ldcp->ldc_port; uint64_t ldc_id = ldcp->ldc_id; - uint32_t rmin, rmax; - uint64_t rmap; int rv; D1(vswp, "%s:enter\n", __func__); @@ -202,39 +201,19 @@ vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp) /* No free shares available */ return (NULL); } - /* - * Allocate a Share - it will come with rings/groups - * already assigned to it. - */ - rv = hcapab->ms_salloc(hcapab->ms_handle, ldc_id, - &vsharep->vs_cookie, &vsharep->vs_shdl); + + WRITE_ENTER(&portp->maccl_rwlock); + rv = mac_share_bind(portp->p_mch, ldc_id, &vsharep->vs_cookie); + RW_EXIT(&portp->maccl_rwlock); if (rv != 0) { - D2(vswp, "Alloc a share failed for ldc=0x%lx rv=%d", - ldc_id, rv); return (NULL); } - /* - * Query the RX group number to bind the port's - * MAC address to it. - */ - hcapab->ms_squery(vsharep->vs_shdl, MAC_RING_TYPE_RX, - &rmin, &rmax, &rmap, &vsharep->vs_gnum); - /* Cache some useful info */ vsharep->vs_ldcid = ldcp->ldc_id; vsharep->vs_macaddr = vnet_macaddr_strtoul( portp->p_macaddr.ether_addr_octet); vsharep->vs_portp = ldcp->ldc_port; - - /* Bind the Guest's MAC address */ - rv = vsw_hio_bind_macaddr(vsharep); - if (rv != 0) { - /* something went wrong, cleanup */ - hcapab->ms_sfree(vsharep->vs_shdl); - return (NULL); - } - vsharep->vs_state |= VSW_SHARE_ASSIGNED; D1(vswp, "%s:exit\n", __func__); @@ -242,61 +221,6 @@ vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp) } /* - * vsw_hio_bind_macaddr -- Remove the port's MAC address from the - * physdev and bind it to the Share's RX group. 
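[Editor's note] The locking comment above pins the order: vswp->mac_lock first, then a port's maccl_rwlock — exactly what vsw_hio_alloc_share() does when it takes the port lock around mac_share_bind() while its caller already holds mac_lock. A small pthreads model of that hierarchy, with user-level stand-ins for the kernel mutex and rwlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t maccl_rwlock = PTHREAD_RWLOCK_INITIALIZER;

/* device-wide op that also touches one client: mac_lock, then rwlock */
static void
hio_alloc_share(void)
{
	pthread_mutex_lock(&mac_lock);		/* 1st: device-wide state */
	pthread_rwlock_wrlock(&maccl_rwlock);	/* 2nd: this client only */
	printf("bind share to mac client\n");
	pthread_rwlock_unlock(&maccl_rwlock);
	pthread_mutex_unlock(&mac_lock);
}

/* client-only op may take just the rwlock -- never the reverse order */
static void
share_unbind(void)
{
	pthread_rwlock_wrlock(&maccl_rwlock);
	printf("unbind share\n");
	pthread_rwlock_unlock(&maccl_rwlock);
}

int
main(void)
{
	hio_alloc_share();
	share_unbind();
	return (0);
}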
- */ -static int -vsw_hio_bind_macaddr(vsw_share_t *vsharep) -{ - vsw_t *vswp = vsharep->vs_vswp; - vsw_port_t *portp = vsharep->vs_portp; - mac_capab_rings_t *rcapab = &vswp->vhio.vh_rcapab; - mac_group_info_t *ginfop = &vsharep->vs_rxginfo; - int rv; - - /* Get the RX groupinfo */ - rcapab->mr_gget(rcapab->mr_handle, MAC_RING_TYPE_RX, - vsharep->vs_gnum, &vsharep->vs_rxginfo, NULL); - - /* Unset the MAC address first */ - if (portp->addr_set != VSW_ADDR_UNSET) { - (void) vsw_unset_hw(vswp, portp, VSW_VNETPORT); - } - - /* Bind the MAC address to the RX group */ - rv = ginfop->mrg_addmac(ginfop->mrg_driver, - (uint8_t *)&portp->p_macaddr.ether_addr_octet); - if (rv != 0) { - /* Restore the address back as it was */ - (void) vsw_set_hw(vswp, portp, VSW_VNETPORT); - return (rv); - } - return (0); -} - -/* - * vsw_hio_unbind_macaddr -- Unbind the port's MAC address and restore - * it back as it was before. - */ -static void -vsw_hio_unbind_macaddr(vsw_share_t *vsharep) -{ - vsw_t *vswp = vsharep->vs_vswp; - vsw_port_t *portp = vsharep->vs_portp; - mac_group_info_t *ginfop = &vsharep->vs_rxginfo; - - if (portp == NULL) { - return; - } - /* Unbind the MAC address from the RX group */ - (void) ginfop->mrg_remmac(ginfop->mrg_driver, - (uint8_t *)&portp->p_macaddr.ether_addr_octet); - - /* Program the MAC address back */ - (void) vsw_set_hw(vswp, portp, VSW_VNETPORT); -} - -/* * vsw_hio_find_free_share -- Find a free Share. */ static vsw_share_t * @@ -380,16 +304,13 @@ static void vsw_hio_free_share(vsw_share_t *vsharep) { vsw_t *vswp = vsharep->vs_vswp; - vsw_hio_t *hiop = &vswp->vhio; - mac_capab_share_t *hcapab = &hiop->vh_scapab; + vsw_port_t *portp = vsharep->vs_portp; D1(vswp, "%s:enter\n", __func__); - /* First unbind the MAC address and restore it back */ - vsw_hio_unbind_macaddr(vsharep); - - /* free share */ - hcapab->ms_sfree(vsharep->vs_shdl); + WRITE_ENTER(&portp->maccl_rwlock); + mac_share_unbind(portp->p_mch); + RW_EXIT(&portp->maccl_rwlock); vsharep->vs_state = VSW_SHARE_FREE; vsharep->vs_macaddr = 0; @@ -455,7 +376,7 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot) * HybridIO. */ READ_ENTER(&plist->lockrw); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); /* * first clear the hio_capable flag so that no more * HybridIO operations are initiated. @@ -515,9 +436,9 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot) * This delay is also needed for the port reset to * release the Hybrid resource. 
*/ - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); drv_usecwait(vsw_hio_cleanup_delay); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); max_retries--; } while ((free_shares < hiop->vh_num_shares) && (max_retries > 0)); @@ -532,7 +453,7 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot) kmem_free(hiop->vh_shares, sizeof (vsw_share_t) * hiop->vh_num_shares); hiop->vh_shares = NULL; hiop->vh_num_shares = 0; - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); RW_EXIT(&plist->lockrw); D1(vswp, "%s:exit\n", __func__); } @@ -560,12 +481,12 @@ vsw_hio_start_ports(vsw_t *vswp) } reset = B_FALSE; - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); vsharep = vsw_hio_find_vshare_port(vswp, portp); if (vsharep == NULL) { reset = B_TRUE; } - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); if (reset == B_TRUE) { /* Cause a rest to trigger HybridIO setup */ @@ -586,9 +507,9 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp) int rv; D1(vswp, "%s:enter ldc=0x%lx", __func__, ldcp->ldc_id); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); if (vswp->hio_capable == B_FALSE) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D2(vswp, "%s:not HIO capable", __func__); return; } @@ -596,14 +517,14 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp) /* Verify if a share was already allocated */ vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id); if (vsharep != NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D2(vswp, "%s:Share already allocated to ldc=0x%lx", __func__, ldcp->ldc_id); return; } vsharep = vsw_hio_alloc_share(vswp, ldcp); if (vsharep == NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D2(vswp, "%s: no Share available for ldc=0x%lx", __func__, ldcp->ldc_id); return; @@ -616,12 +537,12 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp) * Failed to send a DDS message, so cleanup now. 
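[Editor's note] The cleanup loop above polls for shares to drain: it drops vswp->mac_lock, waits (drv_usecwait), re-takes the lock and re-checks with a bounded retry count, so the DDS acks that actually free the shares can make progress while it waits. A user-level model of that bounded drop-wait-recheck loop, with usleep() standing in for drv_usecwait() and a worker thread standing in for DDS processing:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;
static int free_shares, num_shares = 4;

static void *
dds_worker(void *arg)	/* models DDS acks freeing shares elsewhere */
{
	(void) arg;
	for (int i = 0; i < num_shares; i++) {
		usleep(1000);
		pthread_mutex_lock(&mac_lock);
		free_shares++;
		pthread_mutex_unlock(&mac_lock);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	int max_retries = 10;

	pthread_create(&t, NULL, dds_worker, NULL);
	pthread_mutex_lock(&mac_lock);
	do {
		if (free_shares >= num_shares)
			break;
		/* drop the lock so the worker can free shares */
		pthread_mutex_unlock(&mac_lock);
		usleep(2000);	/* drv_usecwait() stand-in */
		pthread_mutex_lock(&mac_lock);
	} while (free_shares < num_shares && --max_retries > 0);
	printf("free=%d retries_left=%d\n", free_shares, max_retries);
	pthread_mutex_unlock(&mac_lock);
	pthread_join(t, NULL);
	return (0);
}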
*/ vsw_hio_free_share(vsharep); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } vsharep->vs_state &= ~VSW_SHARE_DDS_ACKD; vsharep->vs_state |= VSW_SHARE_DDS_SENT; - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); /* DERR only to print by default */ DERR(vswp, "Share allocated for ldc_id=0x%lx Cookie=0x%lX", @@ -640,16 +561,16 @@ vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp) D1(vswp, "%s:enter ldc=0x%lx", __func__, ldcp->ldc_id); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id); if (vsharep == NULL) { D1(vswp, "%s:no share found for ldc=0x%lx", __func__, ldcp->ldc_id); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } vsw_hio_free_share(vsharep); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s:exit ldc=0x%lx", __func__, ldcp->ldc_id); } @@ -669,12 +590,12 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep) uint64_t macaddr = vsharep->vs_macaddr; int rv; - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - mutex_exit(&vswp->hw_lock); + ASSERT(MUTEX_HELD(&vswp->mac_lock)); + mutex_exit(&vswp->mac_lock); portp = vsharep->vs_portp; if (portp == NULL) { - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); return (0); } @@ -683,7 +604,7 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep) ldcp = ldcl->head; if ((ldcp == NULL) || (ldcp->ldc_id != vsharep->vs_ldcid)) { RW_EXIT(&ldcl->lockrw); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); return (0); } req_id = VSW_DDS_NEXT_REQID(vsharep); @@ -691,7 +612,7 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep) cookie, macaddr, req_id); RW_EXIT(&ldcl->lockrw); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); if (rv == 0) { vsharep->vs_state &= ~VSW_SHARE_DDS_ACKD; vsharep->vs_state |= VSW_SHARE_DDS_SENT; @@ -740,14 +661,14 @@ vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg) /* discard */ return; } - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); /* * We expect to receive DDS messages only from guests that * have HybridIO started. */ vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id); if (vsharep == NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } @@ -816,7 +737,7 @@ vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg) __func__, dmsg->dds_subclass); break; } - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s:exit ldc=0x%lx\n", __func__, ldcp->ldc_id); } @@ -857,8 +778,12 @@ vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled) /* Hybrid Mode is disabled, so stop HybridIO */ vsw_hio_stop_port(portp); portp->p_hio_enabled = B_FALSE; + + vsw_port_mac_reconfig(portp, B_FALSE, 0, NULL, 0); } else { portp->p_hio_enabled = B_TRUE; + vsw_port_mac_reconfig(portp, B_FALSE, 0, NULL, 0); + /* reset the port to initiate HybridIO setup */ vsw_hio_port_reset(portp, B_FALSE); } @@ -877,16 +802,16 @@ vsw_hio_stop_port(vsw_port_t *portp) int max_retries = vsw_hio_max_cleanup_retries; D1(vswp, "%s:enter\n", __func__); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); if (vswp->hio_capable == B_FALSE) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } vsharep = vsw_hio_find_vshare_port(vswp, portp); if (vsharep == NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } @@ -925,9 +850,9 @@ vsw_hio_stop_port(vsw_port_t *portp) * messages come and get processed, that is, shares * get freed. 
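[Editor's note] vsw_hio_send_delshare_msg() above documents its contract with ASSERT(MUTEX_HELD(&vswp->mac_lock)): it is entered with the lock held, deliberately drops it across the LDC send (which can block), and re-takes it before touching vs_state again. The key discipline is that everything the send needs is copied out before the drop, and shared state is only re-examined after reacquisition — the surrounding retry loops recheck the share's port and state for exactly this reason. A compact model:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;

struct share { unsigned long cookie, macaddr; int state; };

static int
send_msg(unsigned long cookie, unsigned long mac)	/* may block */
{
	printf("send delshare cookie=%lx mac=%lx\n", cookie, mac);
	return (0);
}

/* caller holds mac_lock on entry and on return, as the driver asserts */
static int
send_delshare(struct share *s)
{
	/* copy out what the send needs while still protected */
	unsigned long cookie = s->cookie, mac = s->macaddr;
	int rv;

	pthread_mutex_unlock(&mac_lock);	/* don't hold across I/O */
	rv = send_msg(cookie, mac);
	pthread_mutex_lock(&mac_lock);		/* re-take before vs_state */
	if (rv == 0)
		s->state = 1;	/* e.g. VSW_SHARE_DDS_SENT */
	return (rv);
}

int
main(void)
{
	struct share s = { 0xabc, 0xdef, 0 };
	pthread_mutex_lock(&mac_lock);
	(void) send_delshare(&s);
	pthread_mutex_unlock(&mac_lock);
	printf("state=%d\n", s.state);
	return (0);
}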
*/ - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); drv_usecwait(vsw_hio_cleanup_delay); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); /* Check if the share still assigned to this port */ if ((vsharep->vs_portp != portp) || @@ -937,7 +862,7 @@ vsw_hio_stop_port(vsw_port_t *portp) max_retries--; } while ((vsharep->vs_state != VSW_SHARE_FREE) && (max_retries > 0)); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s:exit\n", __func__); } @@ -1111,7 +1036,7 @@ vsw_hio_kstats_update(kstat_t *ksp, int rw) return (0); } - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); hiokp->hio_num_shares.value.ul = (uint32_t)hiop->vh_num_shares; for (i = 0; i < hiop->vh_num_shares; i++) { hiokp->share[i].assigned.value.ul = @@ -1119,7 +1044,7 @@ vsw_hio_kstats_update(kstat_t *ksp, int rw) hiokp->share[i].state.value.ul = hiop->vh_shares[i].vs_state; } - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); } else { return (EACCES); } diff --git a/usr/src/uts/sun4v/io/vsw_ldc.c b/usr/src/uts/sun4v/io/vsw_ldc.c index e2273596a1..bfd6dde2fb 100644 --- a/usr/src/uts/sun4v/io/vsw_ldc.c +++ b/usr/src/uts/sun4v/io/vsw_ldc.c @@ -58,7 +58,6 @@ #include <sys/taskq.h> #include <sys/note.h> #include <sys/mach_descrip.h> -#include <sys/mac.h> #include <sys/mdeg.h> #include <sys/ldc.h> #include <sys/vsw_fdb.h> @@ -88,7 +87,7 @@ int vsw_detach_ports(vsw_t *vswp); int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr); int vsw_port_detach(vsw_t *vswp, int p_instance); -int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count); +int vsw_portsend(vsw_port_t *port, mblk_t *mp); int vsw_port_attach(vsw_port_t *portp); vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); void vsw_vlan_unaware_port_reset(vsw_port_t *portp); @@ -165,7 +164,6 @@ static void vsw_stop_rx_thread(vsw_ldc_t *ldcp); static void vsw_ldc_rx_worker(void *arg); /* Misc support routines */ -static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf); static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); static void vsw_free_ring(dring_info_t *); static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr); @@ -183,8 +181,7 @@ static void display_ring(dring_info_t *); * Functions imported from other files. 
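[Editor's note] The kstat handler above follows the usual ks_update protocol: reject KSTAT_WRITE with EACCES, and for reads take a consistent snapshot of the per-share state under the same mac_lock that protects it. A minimal model of that callback shape (the KSTAT_READ/KSTAT_WRITE values here are placeholders, not the kernel's):

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

#define	KSTAT_READ	0
#define	KSTAT_WRITE	1

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;
static int share_state[2] = { 3, 0 };	/* guarded by mac_lock */

static int
hio_kstats_update(int *snapshot, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);	/* stats are read-only */

	pthread_mutex_lock(&mac_lock);	/* consistent snapshot */
	for (int i = 0; i < 2; i++)
		snapshot[i] = share_state[i];
	pthread_mutex_unlock(&mac_lock);
	return (0);
}

int
main(void)
{
	int snap[2];
	printf("write -> %d\n", hio_kstats_update(snap, KSTAT_WRITE));
	printf("read  -> %d (s0=%d)\n",
	    hio_kstats_update(snap, KSTAT_READ), snap[0]);
	return (0);
}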
*/ extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); -extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int); -extern void vsw_reconfig_hw(vsw_t *); +extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int); extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port); extern void vsw_del_mcst_port(vsw_port_t *port); extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg); @@ -205,7 +202,10 @@ extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp); extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp); extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg); extern void vsw_hio_stop_port(vsw_port_t *portp); -extern void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr); +extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp); +extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); +extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); + #define VSW_NUM_VMPOOLS 3 /* number of vio mblk pools */ @@ -309,6 +309,7 @@ vsw_port_attach(vsw_port_t *port) int i; int nids = port->num_ldcs; uint64_t *ldcids; + int rv; D1(vswp, "%s: enter : port %d", __func__, port->p_instance); @@ -328,6 +329,7 @@ vsw_port_attach(vsw_port_t *port) mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL); mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); @@ -339,29 +341,20 @@ vsw_port_attach(vsw_port_t *port) D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { DERR(vswp, "%s: ldc_attach failed", __func__); - - rw_destroy(&port->p_ldclist.lockrw); - - cv_destroy(&port->state_cv); - mutex_destroy(&port->state_lock); - - mutex_destroy(&port->tx_lock); - mutex_destroy(&port->mca_lock); - kmem_free(port, sizeof (vsw_port_t)); - return (1); + goto exit_error; } } if (vswp->switching_setup_done == B_TRUE) { /* - * If the underlying physical device has been setup, - * program the mac address of this port in it. - * Otherwise, port macaddr will be set after the physical - * device is successfully setup by the timeout handler. + * If the underlying network device has been setup, + * then open a mac client and porgram the mac address + * for this port. 
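[Editor's note] vsw_port_attach() now funnels every failure — a failed vsw_ldc_attach() as well as a failed vsw_mac_client_init() — to a single exit_error label that destroys the locks and frees the port in reverse order of construction, replacing the inline cleanup the old ldc-attach failure path duplicated. The idiom, reduced to a compilable sketch with pthread stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct port { pthread_mutex_t tx_lock, mca_lock; pthread_rwlock_t maccl; };

static int
port_attach(int fail_step)
{
	struct port *p = calloc(1, sizeof (*p));

	pthread_mutex_init(&p->tx_lock, NULL);
	pthread_mutex_init(&p->mca_lock, NULL);
	pthread_rwlock_init(&p->maccl, NULL);

	if (fail_step == 1)	/* e.g. ldc attach failed */
		goto exit_error;
	if (fail_step == 2)	/* e.g. mac client init failed */
		goto exit_error;
	return (0);	/* on success the port lives until detach */

exit_error:
	/* one cleanup path, reverse order of construction */
	pthread_rwlock_destroy(&p->maccl);
	pthread_mutex_destroy(&p->mca_lock);
	pthread_mutex_destroy(&p->tx_lock);
	free(p);
	return (1);
}

int
main(void)
{
	printf("fail=%d\n", port_attach(2));
	return (0);
}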
*/ - mutex_enter(&vswp->hw_lock); - (void) vsw_set_hw(vswp, port, VSW_VNETPORT); - mutex_exit(&vswp->hw_lock); + rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT); + if (rv != 0) { + goto exit_error; + } } /* create the fdb entry for this port/mac address */ @@ -386,11 +379,23 @@ vsw_port_attach(vsw_port_t *port) /* announce macaddr of vnet to the physical switch */ if (vsw_publish_macaddr_count != 0) { /* enabled */ - vsw_publish_macaddr(vswp, (uint8_t *)&(port->p_macaddr)); + vsw_publish_macaddr(vswp, port); } D1(vswp, "%s: exit", __func__); return (0); + +exit_error: + rw_destroy(&port->p_ldclist.lockrw); + + cv_destroy(&port->state_cv); + mutex_destroy(&port->state_lock); + + rw_destroy(&port->maccl_rwlock); + mutex_destroy(&port->tx_lock); + mutex_destroy(&port->mca_lock); + kmem_free(port, sizeof (vsw_port_t)); + return (1); } /* @@ -427,6 +432,9 @@ vsw_port_detach(vsw_t *vswp, int p_instance) */ RW_EXIT(&plist->lockrw); + /* Cleanup and close the mac client */ + vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT); + /* Remove the fdb entry for this port/mac address */ vsw_fdbe_del(vswp, &(port->p_macaddr)); vsw_destroy_vlans(port, VSW_VNETPORT); @@ -434,23 +442,6 @@ vsw_port_detach(vsw_t *vswp, int p_instance) /* Remove any multicast addresses.. */ vsw_del_mcst_port(port); - /* Remove address if was programmed into HW. */ - mutex_enter(&vswp->hw_lock); - - /* - * Port's address may not have been set in hardware. This could - * happen if the underlying physical device is not yet available and - * vsw_setup_switching_timeout() may be in progress. - * We remove its addr from hardware only if it has been set before. - */ - if (port->addr_set != VSW_ADDR_UNSET) - (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); - - if (vswp->recfg_reqd) - vsw_reconfig_hw(vswp); - - mutex_exit(&vswp->hw_lock); - if (vsw_port_delete(port)) { return (1); } @@ -482,10 +473,8 @@ vsw_detach_ports(vsw_t *vswp) return (1); } - /* Remove address if was programmed into HW. 
*/ - mutex_enter(&vswp->hw_lock); - (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); - mutex_exit(&vswp->hw_lock); + /* Cleanup and close the mac client */ + vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT); /* Remove the fdb entry for this port/mac address */ vsw_fdbe_del(vswp, &(port->p_macaddr)); @@ -560,6 +549,7 @@ vsw_port_delete(vsw_port_t *port) rw_destroy(&port->p_ldclist.lockrw); + rw_destroy(&port->maccl_rwlock); mutex_destroy(&port->mca_lock); mutex_destroy(&port->tx_lock); @@ -570,6 +560,11 @@ vsw_port_delete(vsw_port_t *port) kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t)); port->num_ldcs = 0; } + + if (port->nvids != 0) { + kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids); + } + kmem_free(port, sizeof (vsw_port_t)); D1(vswp, "%s: exit", __func__); @@ -4205,12 +4200,13 @@ vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp) /* transmit the packet over the given port */ int -vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count) +vsw_portsend(vsw_port_t *port, mblk_t *mp) { vsw_ldc_list_t *ldcl = &port->p_ldclist; vsw_ldc_t *ldcp; + mblk_t *mpt; + int count; int status = 0; - uint32_t n; READ_ENTER(&ldcl->lockrw); /* @@ -4224,18 +4220,13 @@ vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count) return (1); } - n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt); + count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt); - count -= n; - if (count == 0) { - goto vsw_portsend_exit; + if (count != 0) { + status = ldcp->tx(ldcp, mp, mpt, count); } - status = ldcp->tx(ldcp, mp, mpt, count); - -vsw_portsend_exit: RW_EXIT(&ldcl->lockrw); - return (status); } @@ -5735,14 +5726,6 @@ vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) } -static caddr_t -vsw_print_ethaddr(uint8_t *a, char *ebuf) -{ - (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", - a[0], a[1], a[2], a[3], a[4], a[5]); - return (ebuf); -} - /* * Reset and free all the resources associated with * the channel. diff --git a/usr/src/uts/sun4v/io/vsw_phys.c b/usr/src/uts/sun4v/io/vsw_phys.c index 962ccc1cb9..127e1635c1 100644 --- a/usr/src/uts/sun4v/io/vsw_phys.c +++ b/usr/src/uts/sun4v/io/vsw_phys.c @@ -55,7 +55,8 @@ #include <sys/machsystm.h> #include <sys/modctl.h> #include <sys/modhash.h> -#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/taskq.h> #include <sys/note.h> @@ -63,134 +64,133 @@ #include <sys/mac.h> #include <sys/mdeg.h> #include <sys/vsw.h> +#include <sys/vlan.h> /* MAC Ring table functions. 
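[Editor's note] The reworked vsw_portsend() above computes the packet count itself from vsw_vlan_frame_untag() instead of trusting a caller-supplied count, and only calls the channel's tx routine when something survived untagging. A loose user-level model of that filter-then-send contract — a singly linked chain stands in for mblk chains, and "drop frames on vid 0" stands in for the real untagging/filtering, which this sketch does not reproduce:

#include <stdio.h>
#include <stddef.h>

struct pkt { int vid; struct pkt *next; };

/* stand-in filter: return survivors' count plus new head/tail */
static int
frame_untag(struct pkt **head, struct pkt **tail)
{
	struct pkt *p, *prev = NULL, *h = NULL;
	int count = 0;

	for (p = *head; p != NULL; p = p->next) {
		if (p->vid == 0)
			continue;	/* filtered out */
		if (h == NULL)
			h = p;
		else
			prev->next = p;
		prev = p;
		count++;
	}
	if (prev != NULL)
		prev->next = NULL;
	*head = h;
	*tail = prev;
	return (count);
}

static int
portsend(struct pkt *mp)
{
	struct pkt *mpt;
	int status = 0;
	int count = frame_untag(&mp, &mpt);

	if (count != 0)	/* everything may have been filtered away */
		status = printf("tx %d frame(s)\n", count) < 0;
	return (status);
}

int
main(void)
{
	struct pkt c = { 20, NULL }, b = { 0, &c }, a = { 10, &b };
	return (portsend(&a));
}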
*/ -static void vsw_mac_ring_tbl_init(vsw_t *vswp); -static void vsw_mac_ring_tbl_destroy(vsw_t *vswp); -static void vsw_queue_worker(vsw_mac_ring_t *rrp); -static void vsw_queue_stop(vsw_queue_t *vqp); -static vsw_queue_t *vsw_queue_create(); -static void vsw_queue_destroy(vsw_queue_t *vqp); -static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *); -static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *); +static void vsw_port_rx_cb(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +static void vsw_if_rx_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); /* MAC layer routines */ -static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg, - mac_resource_t *mrp); -static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *); -static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int); -static int vsw_unset_hw_addr(vsw_t *, int); -static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int); -static int vsw_prog_if(vsw_t *); +static int vsw_set_port_hw_addr(vsw_port_t *port); +static int vsw_set_if_hw_addr(vsw_t *vswp); +static void vsw_unset_hw_addr(vsw_t *, vsw_port_t *, int); +static int vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type); +static void vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type); +static void vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type); +static void vsw_mac_multicast_remove_all(vsw_t *vswp, + vsw_port_t *portp, int type); +static void vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch, + uint8_t *macaddr, uint16_t flags, vsw_vlanid_t *vids, int nvids); +static void vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids, + int nvids); static void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu); /* Support functions */ -static int vsw_prog_ports(vsw_t *); int vsw_set_hw(vsw_t *, vsw_port_t *, int); -int vsw_unset_hw(vsw_t *, vsw_port_t *, int); +void vsw_unset_hw(vsw_t *, vsw_port_t *, int); void vsw_reconfig_hw(vsw_t *); -int vsw_mac_attach(vsw_t *vswp); -void vsw_mac_detach(vsw_t *vswp); int vsw_mac_open(vsw_t *vswp); void vsw_mac_close(vsw_t *vswp); +int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p, + int type); +void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, + mcst_addr_t *mcst_p, int type); +int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); +void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); +void vsw_mac_cleanup_ports(vsw_t *vswp); void vsw_unset_addrs(vsw_t *vswp); void vsw_set_addrs(vsw_t *vswp); -int vsw_get_hw_maddr(vsw_t *); -mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); -void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr); +mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *); +void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp); +void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); +void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid, + vsw_vlanid_t *new_vids, int new_nvids); +void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); +/* + * Functions imported from other files. 
+ */ +extern int vsw_portsend(vsw_port_t *port, mblk_t *mp); +extern void vsw_hio_stop_port(vsw_port_t *portp); +extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate); +extern uint32_t vsw_publish_macaddr_count; +extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, + mblk_t **npt); static char mac_mtu_propname[] = "mtu"; /* * Tunables used in this file. */ extern int vsw_mac_open_retries; -extern boolean_t vsw_multi_ring_enable; -extern int vsw_mac_rx_rings; -extern uint32_t vsw_publish_macaddr_count; -/* - * Check to see if the card supports the setting of multiple unicst - * addresses. - * - * Returns 0 if card supports the programming of multiple unicast addresses, - * otherwise returns 1. - */ -int -vsw_get_hw_maddr(vsw_t *vswp) -{ - D1(vswp, "%s: enter", __func__); - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); +#define WRITE_MACCL_ENTER(vswp, port, type) \ + (type == VSW_LOCALDEV) ? rw_enter(&vswp->maccl_rwlock, RW_WRITER) :\ + rw_enter(&port->maccl_rwlock, RW_WRITER) - if (vswp->mh == NULL) - return (1); +#define READ_MACCL_ENTER(vswp, port, type) \ + (type == VSW_LOCALDEV) ? rw_enter(&vswp->maccl_rwlock, RW_READER) :\ + rw_enter(&port->maccl_rwlock, RW_READER) - if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) { - cmn_err(CE_NOTE, "!vsw%d: device (%s) does not support " - "programming multiple addresses", vswp->instance, - vswp->physname); - return (1); - } +#define RW_MACCL_EXIT(vswp, port, type) \ + (type == VSW_LOCALDEV) ? rw_exit(&vswp->maccl_rwlock) : \ + rw_exit(&port->maccl_rwlock) - D2(vswp, "%s: %d addrs : %d free", __func__, - vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree); - D1(vswp, "%s: exit", __func__); +/* + * Locking strategy in this file is explained as follows: + * - A global lock(vswp->mac_lock) is used to protect the + * MAC calls that deal with entire device. That is, the + * operations that deal with mac_handle which include + * mac_open()/close() and mac_client_open(). + * + * - A per port/interface RW lock(maccl_rwlock) is used protect + * the operations that deal with the MAC client. + * + * When both mac_lock and maccl_rwlock need to be held, the + * mac_lock need be acquired first and then maccl_rwlock. That is, + * mac_lock---->maccl_rwlock + * + * The 'mca_lock' that protects the mcast list is also acquired + * within the context of maccl_rwlock. The hierarchy for this + * one is as below: + * maccl_rwlock---->mca_lock + */ - return (0); -} /* * Program unicast and multicast addresses of vsw interface and the ports - * into the physical device. + * into the network device. */ void vsw_set_addrs(vsw_t *vswp) { vsw_port_list_t *plist = &vswp->plist; vsw_port_t *port; - mcst_addr_t *mcap; int rv; READ_ENTER(&vswp->if_lockrw); if (vswp->if_state & VSW_IF_UP) { - /* program unicst addr of vsw interface in the physdev */ - if (vswp->addr_set == VSW_ADDR_UNSET) { - mutex_enter(&vswp->hw_lock); - rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); - if (rv != 0) { - cmn_err(CE_NOTE, - "!vsw%d: failed to program interface " - "unicast address\n", vswp->instance); - } - /* - * Notify the MAC layer of the changed address. 
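[Editor's note] The WRITE/READ_MACCL_ENTER macros above select between the interface-wide lock and the per-port lock with a conditional expression, so one body of code serves both VSW_LOCALDEV and VSW_VNETPORT callers. A user-level rendition with pthread rwlocks; note that because the expansion is a bare ternary, callers must treat it as a complete statement, and the real macros compare an unparenthesized `type`, so they expect a plain value as that argument:

#include <pthread.h>
#include <stdio.h>

#define	VSW_LOCALDEV	1
#define	VSW_VNETPORT	2

struct vsw  { pthread_rwlock_t maccl_rwlock; };
struct port { pthread_rwlock_t maccl_rwlock; };

#define	WRITE_MACCL_ENTER(vswp, portp, type)			\
	((type) == VSW_LOCALDEV ?				\
	    pthread_rwlock_wrlock(&(vswp)->maccl_rwlock) :	\
	    pthread_rwlock_wrlock(&(portp)->maccl_rwlock))

#define	RW_MACCL_EXIT(vswp, portp, type)			\
	((type) == VSW_LOCALDEV ?				\
	    pthread_rwlock_unlock(&(vswp)->maccl_rwlock) :	\
	    pthread_rwlock_unlock(&(portp)->maccl_rwlock))

int
main(void)
{
	struct vsw v;
	struct port p;

	pthread_rwlock_init(&v.maccl_rwlock, NULL);
	pthread_rwlock_init(&p.maccl_rwlock, NULL);

	WRITE_MACCL_ENTER(&v, &p, VSW_VNETPORT);	/* takes p's lock */
	printf("programming port mac client\n");
	RW_MACCL_EXIT(&v, &p, VSW_VNETPORT);
	return (0);
}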
- */ - mac_unicst_update(vswp->if_mh, - (uint8_t *)&vswp->if_addr); + /* Open a mac client and program addresses */ + rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV); + if (rv != 0) { + cmn_err(CE_NOTE, + "!vsw%d: failed to program interface " + "unicast address\n", vswp->instance); } - /* program mcast addrs of vsw interface in the physdev */ - mutex_enter(&vswp->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) { - if (mcap->mac_added) - continue; - rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca); - if (rv == 0) { - mcap->mac_added = B_TRUE; - } else { - cmn_err(CE_NOTE, "!vsw%d: unable to add " - "multicast address: %s\n", vswp->instance, - ether_sprintf((void *)&mcap->mca)); - } + /* + * Notify the MAC layer of the changed address. + */ + if (rv == 0) { + mac_unicst_update(vswp->if_mh, + (uint8_t *)&vswp->if_addr); } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&vswp->mca_lock); } @@ -198,43 +198,24 @@ vsw_set_addrs(vsw_t *vswp) WRITE_ENTER(&plist->lockrw); - /* program unicast address of ports in the physical device */ - mutex_enter(&vswp->hw_lock); + /* program unicast address of ports in the network device */ for (port = plist->head; port != NULL; port = port->p_next) { - if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */ + if (port->addr_set) /* addr already set */ continue; - if (vsw_set_hw(vswp, port, VSW_VNETPORT)) { - cmn_err(CE_NOTE, - "!vsw%d: port:%d failed to set unicast address\n", - vswp->instance, port->p_instance); - } - } - mutex_exit(&vswp->hw_lock); - /* program multicast addresses of ports in the physdev */ - for (port = plist->head; port != NULL; port = port->p_next) { - mutex_enter(&port->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) { - if (mcap->mac_added) - continue; - rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca); - if (rv == 0) { - mcap->mac_added = B_TRUE; - } else { - cmn_err(CE_NOTE, "!vsw%d: unable to add " - "multicast address: %s\n", vswp->instance, - ether_sprintf((void *)&mcap->mca)); - } + /* Open a mac client and program addresses */ + rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT); + if (rv != 0) { + cmn_err(CE_NOTE, + "!vsw%d: failed to program port(%d) " + "unicast address\n", vswp->instance, + port->p_instance); } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&port->mca_lock); } - /* announce macaddr of vnets to the physical switch */ if (vsw_publish_macaddr_count != 0) { /* enabled */ for (port = plist->head; port != NULL; port = port->p_next) { - vsw_publish_macaddr(vswp, (uint8_t *)&port->p_macaddr); + vsw_publish_macaddr(vswp, port); } } @@ -242,93 +223,37 @@ vsw_set_addrs(vsw_t *vswp) } /* - * Remove unicast and multicast addresses of vsw interface and the ports - * from the physical device. + * Remove unicast, multicast addresses and close mac clients + * for the vsw interface and all ports. 
*/ void vsw_unset_addrs(vsw_t *vswp) { - vsw_port_list_t *plist = &vswp->plist; - vsw_port_t *port; - mcst_addr_t *mcap; - READ_ENTER(&vswp->if_lockrw); - if (vswp->if_state & VSW_IF_UP) { - /* - * Remove unicast addr of vsw interfce - * from current physdev - */ - mutex_enter(&vswp->hw_lock); - (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); - - /* - * Remove mcast addrs of vsw interface - * from current physdev - */ - mutex_enter(&vswp->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) { - if (!mcap->mac_added) - continue; - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcap->mca); - mcap->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&vswp->mca_lock); - + /* Cleanup and close the mac client for the interface */ + vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV); } - RW_EXIT(&vswp->if_lockrw); - WRITE_ENTER(&plist->lockrw); - - /* - * Remove unicast address of ports from the current physical device - */ - mutex_enter(&vswp->hw_lock); - for (port = plist->head; port != NULL; port = port->p_next) { - /* Remove address if was programmed into HW. */ - if (port->addr_set == VSW_ADDR_UNSET) - continue; - (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); - } - mutex_exit(&vswp->hw_lock); - - /* Remove multicast addresses of ports from the current physdev */ - for (port = plist->head; port != NULL; port = port->p_next) { - mutex_enter(&port->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) { - if (!mcap->mac_added) - continue; - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcap->mca); - mcap->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&port->mca_lock); - } - - RW_EXIT(&plist->lockrw); + /* Cleanup and close the mac clients for all ports */ + vsw_mac_cleanup_ports(vswp); } /* - * Open the underlying physical device for access in layer2 mode. + * Open the underlying network device for access in layer2 mode. * Returns: - * 0 on success - * EAGAIN if mac_open() fails due to the device being not available yet. - * EIO on any other failures. + * 0 on success + * EAGAIN if mac_open() fails due to the device being not available yet. + * EIO on any other failures. */ int vsw_mac_open(vsw_t *vswp) { - int rv; + int rv; - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); + ASSERT(MUTEX_HELD(&vswp->mac_lock)); if (vswp->mh != NULL) { /* already open */ @@ -352,14 +277,15 @@ vsw_mac_open(vsw_t *vswp) if (rv == ENOENT || rv == EBADF) { return (EAGAIN); } else { - cmn_err(CE_WARN, "vsw%d: device (%s) open failed rv:%x", + cmn_err(CE_WARN, "vsw%d: mac_open %s failed rv:%x", vswp->instance, vswp->physname, rv); return (EIO); } } - vswp->mac_open_retries = 0; + vsw_mac_set_mtu(vswp, vswp->mtu); + return (0); } @@ -369,1005 +295,852 @@ vsw_mac_open(vsw_t *vswp) void vsw_mac_close(vsw_t *vswp) { - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); + ASSERT(MUTEX_HELD(&vswp->mac_lock)); if (vswp->mh != NULL) { + if (vswp->mtu != vswp->mtu_physdev_orig) { + vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig); + } mac_close(vswp->mh); vswp->mh = NULL; } } /* - * Link into the MAC layer to gain access to the services provided by - * the underlying physical device driver (which should also have - * registered with the MAC layer). - * - * Only when in layer 2 mode. + * Add multicast addr. 
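[Editor's note] vsw_mac_open() above maps "device not there yet" errors (ENOENT, EBADF) to EAGAIN so the caller's timeout handler can keep retrying, while any other failure is terminal (EIO). The caller-side contract, sketched at user level with bounded retries, a stub device that appears on the third attempt, and usleep() standing in for the timeout mechanism:

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

static int attempts;

static int
mac_open_model(void)	/* stub: device shows up on the 3rd try */
{
	return (++attempts < 3 ? EAGAIN : 0);
}

static int
setup_switching(int max_retries)
{
	int rv;

	while (max_retries-- > 0) {
		rv = mac_open_model();
		if (rv == 0)
			return (0);	/* opened, continue setup */
		if (rv != EAGAIN)
			return (rv);	/* terminal, e.g. EIO */
		usleep(10000);		/* timeout handler stand-in */
	}
	return (EAGAIN);
}

int
main(void)
{
	printf("rv=%d attempts=%d\n", setup_switching(10), attempts);
	return (0);
}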
*/ int -vsw_mac_attach(vsw_t *vswp) +vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p, + int type) { - D1(vswp, "%s: enter", __func__); - - ASSERT(vswp->mrh == NULL); - ASSERT(vswp->mstarted == B_FALSE); - ASSERT(vswp->mresources == B_FALSE); - - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); - - ASSERT(vswp->mh != NULL); - - D2(vswp, "vsw_mac_attach: using device %s", vswp->physname); - - vsw_mac_set_mtu(vswp, vswp->mtu); - - if (vsw_multi_ring_enable) { - /* - * Initialize the ring table. - */ - vsw_mac_ring_tbl_init(vswp); - - /* - * Register our rx callback function. - */ - vswp->mrh = mac_rx_add(vswp->mh, - vsw_rx_queue_cb, (void *)vswp); - ASSERT(vswp->mrh != NULL); - - /* - * Register our mac resource callback. - */ - mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp); - vswp->mresources = B_TRUE; - - /* - * Get the ring resources available to us from - * the mac below us. - */ - mac_resources(vswp->mh); - } else { - /* - * Just register our rx callback function - */ - vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp); - ASSERT(vswp->mrh != NULL); - } - - /* Get the MAC tx fn */ - vswp->txinfo = mac_tx_get(vswp->mh); - - /* start the interface */ - if (mac_start(vswp->mh) != 0) { - cmn_err(CE_WARN, "!vsw%d: Could not start mac interface", - vswp->instance); - goto mac_fail_exit; + int ret = 0; + mac_client_handle_t mch; + + WRITE_MACCL_ENTER(vswp, port, type); + + mch = (type == VSW_LOCALDEV) ? vswp->mch : port->p_mch; + + if (mch != NULL) { + ret = mac_multicast_add(mch, mcst_p->mca.ether_addr_octet); + if (ret != 0) { + cmn_err(CE_WARN, "!vsw%d: unable to " + "program multicast address(%s) err=%d", + vswp->instance, + ether_sprintf((void *)&mcst_p->mca), ret); + RW_MACCL_EXIT(vswp, port, type); + return (ret); + } + mcst_p->mac_added = B_TRUE; } - vswp->mstarted = B_TRUE; - - D1(vswp, "%s: exit", __func__); - return (0); - -mac_fail_exit: - vsw_mac_detach(vswp); - - D1(vswp, "%s: exit", __func__); - return (1); + RW_MACCL_EXIT(vswp, port, type); + return (ret); } +/* + * Remove multicast addr. + */ void -vsw_mac_detach(vsw_t *vswp) +vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p, + int type) { - D1(vswp, "vsw_mac_detach: enter"); - - ASSERT(vswp != NULL); - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); + mac_client_handle_t mch; - if (vsw_multi_ring_enable) { - vsw_mac_ring_tbl_destroy(vswp); - } + WRITE_MACCL_ENTER(vswp, port, type); + mch = (type == VSW_LOCALDEV) ? vswp->mch : port->p_mch; - if (vswp->mh != NULL) { - if (vswp->mstarted) - mac_stop(vswp->mh); - if (vswp->mrh != NULL) - mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE); - if (vswp->mresources) - mac_resource_set(vswp->mh, NULL, NULL); - if (vswp->mtu != vswp->mtu_physdev_orig) { - vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig); - } + if (mch != NULL && mcst_p->mac_added) { + mac_multicast_remove(mch, mcst_p->mca.ether_addr_octet); + mcst_p->mac_added = B_FALSE; } - - vswp->mrh = NULL; - vswp->txinfo = NULL; - vswp->mstarted = B_FALSE; - - D1(vswp, "vsw_mac_detach: exit"); + RW_MACCL_EXIT(vswp, port, type); } + /* - * Depending on the mode specified, the capabilites and capacity - * of the underlying device setup the physical device. - * - * If in layer 3 mode, then do nothing. - * - * If in layer 2 programmed mode attempt to program the unicast address - * associated with the port into the physical device. 
If this is not - * possible due to resource exhaustion or simply because the device does - * not support multiple unicast addresses then if required fallback onto - * putting the card into promisc mode. - * - * If in promisc mode then simply set the card into promisc mode. - * - * Returns 0 success, 1 on failure. + * Add all multicast addresses of the port. */ -int -vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type) +static void +vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type) { - mac_multi_addr_t mac_addr; - int err; + mcst_addr_t *mcap; + mac_client_handle_t mch; + kmutex_t *mca_lockp; + int rv; - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - - if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) - return (0); - - if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) { - return (vsw_set_hw_promisc(vswp, port, type)); - } - - /* - * Attempt to program the unicast address into the HW. - */ - mac_addr.mma_addrlen = ETHERADDRL; - if (type == VSW_VNETPORT) { - ASSERT(port != NULL); - ether_copy(&port->p_macaddr, &mac_addr.mma_addr); + if (type == VSW_LOCALDEV) { + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + mch = vswp->mch; + mcap = vswp->mcap; + mca_lockp = &vswp->mca_lock; } else { - ether_copy(&vswp->if_addr, &mac_addr.mma_addr); + ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock)); + mch = portp->p_mch; + mcap = portp->mcap; + mca_lockp = &portp->mca_lock; } - err = vsw_set_hw_addr(vswp, &mac_addr); - if (err == ENOSPC) { - /* - * Mark that attempt should be made to re-config sometime - * in future if a port is deleted. - */ - vswp->recfg_reqd = B_TRUE; - - /* - * Only 1 mode specified, nothing more to do. - */ - if (vswp->smode_num == 1) - return (err); + if (mch == NULL) + return; - /* - * If promiscuous was next mode specified try to - * set the card into that mode. - */ - if ((vswp->smode_idx <= (vswp->smode_num - 2)) && - (vswp->smode[vswp->smode_idx + 1] == - VSW_LAYER2_PROMISC)) { - vswp->smode_idx += 1; - return (vsw_set_hw_promisc(vswp, port, type)); + mutex_enter(mca_lockp); + for (mcap = mcap; mcap != NULL; mcap = mcap->nextp) { + if (mcap->mac_added) + continue; + rv = mac_multicast_add(mch, (uchar_t *)&mcap->mca); + if (rv == 0) { + mcap->mac_added = B_TRUE; + } else { + cmn_err(CE_WARN, "!vsw%d: unable to program " + "multicast address(%s) err=%d", vswp->instance, + ether_sprintf((void *)&mcap->mca), rv); } - return (err); } + mutex_exit(mca_lockp); +} - if (err != 0) - return (err); +/* + * Remove all multicast addresses of the port. 
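[Editor's note] A small wart in vsw_mac_multicast_add_all() above: the loop initializer `for (mcap = mcap; ...)` is a self-assignment, presumably left over from unifying the port and interface paths; the remove_all variant that follows already uses the cleaner empty initializer `for (; ...)`. The mac_added flag is what makes both walks idempotent, as in this reduced model:

#include <stdio.h>
#include <stdbool.h>

struct mcst { unsigned addr; bool mac_added; struct mcst *next; };

static void
multicast_add_all(struct mcst *mcap)
{
	for (; mcap != NULL; mcap = mcap->next) {	/* no self-assignment */
		if (mcap->mac_added)
			continue;	/* already programmed: skip */
		printf("program %#x\n", mcap->addr);
		mcap->mac_added = true;
	}
}

static void
multicast_remove_all(struct mcst *mcap)
{
	for (; mcap != NULL; mcap = mcap->next) {
		if (!mcap->mac_added)
			continue;	/* never programmed: skip */
		printf("remove %#x\n", mcap->addr);
		mcap->mac_added = false;
	}
}

int
main(void)
{
	struct mcst b = { 0x5e0002, false, NULL };
	struct mcst a = { 0x5e0001, true, &b };	/* already added */

	multicast_add_all(&a);		/* programs only 0x5e0002 */
	multicast_add_all(&a);		/* second call is a no-op */
	multicast_remove_all(&a);
	return (0);
}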
+ */ +static void +vsw_mac_multicast_remove_all(vsw_t *vswp, vsw_port_t *portp, int type) +{ + mac_client_handle_t mch; + mcst_addr_t *mcap; + kmutex_t *mca_lockp; - if (type == VSW_VNETPORT) { - port->addr_slot = mac_addr.mma_slot; - port->addr_set = VSW_ADDR_HW; + ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); + if (type == VSW_LOCALDEV) { + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + mch = vswp->mch; + mcap = vswp->mcap; + mca_lockp = &vswp->mca_lock; } else { - vswp->addr_slot = mac_addr.mma_slot; - vswp->addr_set = VSW_ADDR_HW; + ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock)); + mch = portp->p_mch; + mcap = portp->mcap; + mca_lockp = &portp->mca_lock; } - D2(vswp, "programmed addr %s into slot %d " - "of device %s", ether_sprintf((void *)mac_addr.mma_addr), - mac_addr.mma_slot, vswp->physname); - - D1(vswp, "%s: exit", __func__); + if (mch == NULL) + return; - return (0); + mutex_enter(mca_lockp); + for (; mcap != NULL; mcap = mcap->nextp) { + if (!mcap->mac_added) + continue; + (void) mac_multicast_remove(mch, (uchar_t *)&mcap->mca); + mcap->mac_added = B_FALSE; + } + mutex_exit(mca_lockp); } /* - * If in layer 3 mode do nothing. - * - * If in layer 2 switched mode remove the address from the physical - * device. - * - * If in layer 2 promiscuous mode disable promisc mode. - * - * Returns 0 on success. + * Open a mac client and program uncast and multicast addresses + * for a port or the interface. + * Returns: + * 0 on success + * non-zero for failure. */ int -vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type) +vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type) { - mac_addr_slot_t slot; - int rv; - - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - - if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) - return (0); + int rv; - switch (type) { - case VSW_VNETPORT: - ASSERT(port != NULL); - - if (port->addr_set == VSW_ADDR_PROMISC) { - return (vsw_unset_hw_promisc(vswp, port, type)); - - } else if (port->addr_set == VSW_ADDR_HW) { - slot = port->addr_slot; - if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0) - port->addr_set = VSW_ADDR_UNSET; - } + mutex_enter(&vswp->mac_lock); + WRITE_MACCL_ENTER(vswp, port, type); + rv = vsw_maccl_open(vswp, port, type); - break; + /* Release mac_lock now */ + mutex_exit(&vswp->mac_lock); - case VSW_LOCALDEV: - if (vswp->addr_set == VSW_ADDR_PROMISC) { - return (vsw_unset_hw_promisc(vswp, NULL, type)); - - } else if (vswp->addr_set == VSW_ADDR_HW) { - slot = vswp->addr_slot; - if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0) - vswp->addr_set = VSW_ADDR_UNSET; - } - - break; - - default: - /* should never happen */ - DERR(vswp, "%s: unknown type %d", __func__, type); - ASSERT(0); - return (1); + if (rv == 0) { + (void) vsw_set_hw(vswp, port, type); + vsw_mac_multicast_add_all(vswp, port, type); } - - D1(vswp, "%s: exit", __func__); + RW_MACCL_EXIT(vswp, port, type); return (rv); } /* - * Attempt to program a unicast address into HW. + * Open a MAC client for a port or an interface. + * The flags and their purpose as below: * - * Returns 0 on sucess, 1 on failure. + * MAC_OPEN_FLAGS_NO_HWRINGS -- This flag is used by default + * for all ports/interface so that they are associated with + * default group & resources. It will not be used for the + * ports that have HybridIO is enabled so that the h/w resources + * assigned to it. + * + * MAC_OPEN_FLAGS_SHARES_DESIRED -- This flag is used to indicate + * that a port desires a Share. This will be the case with the + * the ports that have hybrid mode enabled. 
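[Editor's note] The flag choices documented above reduce to: start from "software rings, vsw does its own tagging and stripping", and for a HybridIO-enabled port swap the no-hardware-rings default for a share request. A sketch of that computation; the flag names come from the hunk, but the bit values here are invented for the model:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* names from the driver; values are placeholders for this model */
#define	MAC_OPEN_FLAGS_NO_HWRINGS	0x01
#define	MAC_OPEN_FLAGS_SHARES_DESIRED	0x02
#define	MAC_OPEN_FLAGS_TAG_DISABLE	0x04
#define	MAC_OPEN_FLAGS_STRIP_DISABLE	0x08

static uint64_t
maccl_open_flags(bool hio_enabled)
{
	uint64_t flags = MAC_OPEN_FLAGS_NO_HWRINGS |
	    MAC_OPEN_FLAGS_TAG_DISABLE |	/* vsw inserts tags itself */
	    MAC_OPEN_FLAGS_STRIP_DISABLE;	/* and strips them itself */

	if (hio_enabled) {
		/* a HybridIO port wants h/w rings and a share */
		flags &= ~MAC_OPEN_FLAGS_NO_HWRINGS;
		flags |= MAC_OPEN_FLAGS_SHARES_DESIRED;
	}
	return (flags);
}

int
main(void)
{
	printf("plain port: %#llx\n",
	    (unsigned long long)maccl_open_flags(false));
	printf("hio port:   %#llx\n",
	    (unsigned long long)maccl_open_flags(true));
	return (0);
}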
This will only cause + * MAC layer to allocate a share and corresponding resources + * ahead of time. + * + * MAC_OPEN_FLAGS_TAG_DISABLE -- This flag is used for VLAN + * support. It will cause MAC to not add any tags, but expect + * vsw to tag the packets. + * + * MAC_OPEN_FLAGS_STRIP_DISABLE -- This flag is used for VLAN + * support. It will case the MAC layer to not strip the tags. + * Vsw may have to strip the tag for pvid case. */ static int -vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac) +vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type) { - void *mah; - int rv = EINVAL; - - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - - if (vswp->maddr.maddr_handle == NULL) - return (rv); - - mah = vswp->maddr.maddr_handle; - - rv = vswp->maddr.maddr_add(mah, mac); + int rv = 0; + int instance; + char mac_cl_name[MAXNAMELEN]; + const char *dev_name; + mac_client_handle_t *mchp; + uint64_t flags = (MAC_OPEN_FLAGS_NO_HWRINGS | + MAC_OPEN_FLAGS_TAG_DISABLE | + MAC_OPEN_FLAGS_STRIP_DISABLE); + + ASSERT(MUTEX_HELD(&vswp->mac_lock)); + if (vswp->mh == NULL) { + /* + * In case net-dev is changed (either set to nothing or + * using aggregation device), return success here as the + * timeout mechanism will handle it. + */ + return (0); + } - if (rv == 0) - return (rv); + mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch; + if (*mchp != NULL) { + /* already open */ + return (0); + } + dev_name = ddi_driver_name(vswp->dip); + instance = ddi_get_instance(vswp->dip); + if (type == VSW_VNETPORT) { + if (port->p_hio_enabled == B_TRUE) { + flags &= ~MAC_OPEN_FLAGS_NO_HWRINGS; + flags |= MAC_OPEN_FLAGS_SHARES_DESIRED; + } + (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%d%s%d", dev_name, + instance, "_port", port->p_instance); + } else { + (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%s%d", + dev_name, "_if", instance); + } - /* - * Its okay for the add to fail because we have exhausted - * all the resouces in the hardware device. Any other error - * we want to flag. - */ - if (rv != ENOSPC) { - cmn_err(CE_NOTE, "!vsw%d: error programming " - "address %s into HW err (%d)", - vswp->instance, ether_sprintf((void *)mac->mma_addr), rv); + rv = mac_client_open(vswp->mh, mchp, mac_cl_name, flags); + if (rv != 0) { + cmn_err(CE_NOTE, "!vsw%d:%s mac_client_open() failed\n", + vswp->instance, mac_cl_name); } - D1(vswp, "%s: exit", __func__); return (rv); } /* - * Remove a unicast mac address which has previously been programmed - * into HW. - * - * Returns 0 on sucess, 1 on failure. + * Clean up by removing uncast, multicast addresses and + * closing the MAC client for a port or the interface. */ -static int -vsw_unset_hw_addr(vsw_t *vswp, int slot) +void +vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type) { - void *mah; - int rv; - - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - ASSERT(slot >= 0); + WRITE_MACCL_ENTER(vswp, port, type); + vsw_unset_hw(vswp, port, type); + vsw_maccl_close(vswp, port, type); + vsw_mac_multicast_remove_all(vswp, port, type); + RW_MACCL_EXIT(vswp, port, type); +} - if (vswp->maddr.maddr_handle == NULL) - return (1); +/* + * Close a MAC client for a port or an interface. 
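[Editor's note] The snprintf() calls in vsw_maccl_open() derive a diagnostic client name from the driver name and instance numbers: with a driver name of "vsw" (a ddi_driver_name() stand-in here), port clients come out as e.g. "vsw0_port3" and the interface client as "vsw_if0", per the two format strings in the hunk. Reduced to a checkable form:

#include <stdio.h>

#define	MAXNAMELEN	256

int
main(void)
{
	char name[MAXNAMELEN];
	const char *dev_name = "vsw";	/* ddi_driver_name() stand-in */
	int instance = 0;		/* ddi_get_instance() stand-in */
	int p_instance = 3;

	/* port client: <drv><inst>_port<port-inst> */
	(void) snprintf(name, MAXNAMELEN, "%s%d%s%d", dev_name,
	    instance, "_port", p_instance);
	printf("%s\n", name);	/* vsw0_port3 */

	/* interface client: <drv>_if<inst> */
	(void) snprintf(name, MAXNAMELEN, "%s%s%d", dev_name,
	    "_if", instance);
	printf("%s\n", name);	/* vsw_if0 */
	return (0);
}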
+ */ +static void +vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type) +{ + mac_client_handle_t *mchp; - mah = vswp->maddr.maddr_handle; + ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - rv = vswp->maddr.maddr_remove(mah, slot); - if (rv != 0) { - DWARN(vswp, "%s: unable to remove address " - "from slot %d in device %s (err %d)", - __func__, slot, vswp->physname, rv); - return (1); + mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch; + if (*mchp != NULL) { + mac_client_close(*mchp, 0); + *mchp = NULL; } +} - D2(vswp, "removed addr from slot %d in device %s", - slot, vswp->physname); +/* + * Cleanup MAC client related stuff for all ports. + */ +void +vsw_mac_cleanup_ports(vsw_t *vswp) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *port; - D1(vswp, "%s: exit", __func__); - return (0); + READ_ENTER(&plist->lockrw); + for (port = plist->head; port != NULL; port = port->p_next) { + vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT); + } + RW_EXIT(&plist->lockrw); } /* - * Set network card into promisc mode. + * Depending on the mode specified, the capabilites and capacity + * of the underlying device setup the physical device. + * + * If in layer 3 mode, then do nothing. * - * Returns 0 on success, 1 on failure. + * If in layer 2 mode, open a mac client and program the mac-address + * and vlan-ids. The MAC layer will take care of programming + * the address into h/w or set the h/w into promiscuous mode. + * + * Returns 0 success, 1 on failure. */ -static int -vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type) +int +vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type) { + int err = 1; + D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh == NULL) { - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - - if (vswp->promisc_cnt++ == 0) { - if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) { - vswp->promisc_cnt--; - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - cmn_err(CE_NOTE, "!vsw%d: switching device %s into " - "promiscuous mode", vswp->instance, vswp->physname); - } - RW_EXIT(&vswp->mac_rwlock); + if (vswp->smode == VSW_LAYER3) + return (0); if (type == VSW_VNETPORT) { ASSERT(port != NULL); - port->addr_set = VSW_ADDR_PROMISC; + err = vsw_set_port_hw_addr(port); } else { - vswp->addr_set = VSW_ADDR_PROMISC; + err = vsw_set_if_hw_addr(vswp); } D1(vswp, "%s: exit", __func__); - - return (0); + return (err); } /* - * Turn off promiscuous mode on network card. + * If in layer 3 mode do nothing. * - * Returns 0 on success, 1 on failure. + * If in layer 2 switched mode remove the address from the physical + * device. + * + * If in layer 2 promiscuous mode disable promisc mode. + * + * Returns 0 on success. 
*/ -static int -vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type) +void +vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type) { - vsw_port_list_t *plist = &vswp->plist; - - D2(vswp, "%s: enter", __func__); + D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh == NULL) { - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - - if (--vswp->promisc_cnt == 0) { - if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) { - vswp->promisc_cnt++; - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - - /* - * We are exiting promisc mode either because we were - * only in promisc mode because we had failed over from - * switched mode due to HW resource issues, or the user - * wanted the card in promisc mode for all the ports and - * the last port is now being deleted. Tweak the message - * accordingly. - */ - if (plist->num_ports != 0) { - cmn_err(CE_NOTE, "!vsw%d: switching device %s back to " - "programmed mode", vswp->instance, vswp->physname); - } else { - cmn_err(CE_NOTE, "!vsw%d: switching device %s out of " - "promiscuous mode", vswp->instance, vswp->physname); - } - } - RW_EXIT(&vswp->mac_rwlock); + if (vswp->smode == VSW_LAYER3) + return; if (type == VSW_VNETPORT) { ASSERT(port != NULL); - ASSERT(port->addr_set == VSW_ADDR_PROMISC); - port->addr_set = VSW_ADDR_UNSET; + vsw_unset_hw_addr(vswp, port, type); } else { - ASSERT(vswp->addr_set == VSW_ADDR_PROMISC); - vswp->addr_set = VSW_ADDR_UNSET; + vsw_unset_hw_addr(vswp, NULL, type); } D1(vswp, "%s: exit", __func__); - return (0); } /* - * Determine whether or not we are operating in our prefered - * mode and if not whether the physical resources now allow us - * to operate in it. + * Program the macaddress and vlans of a port. * - * If a port is being removed should only be invoked after port has been - * removed from the port list. + * Returns 0 on sucess, 1 on failure. */ -void -vsw_reconfig_hw(vsw_t *vswp) +static int +vsw_set_port_hw_addr(vsw_port_t *port) { - int s_idx; + vsw_t *vswp = port->p_vswp; + uint16_t mac_flags = 0; + mac_diag_t diag; + uint8_t *macaddr; + uint16_t vid = VLAN_ID_NONE; + int rv; D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - - if (vswp->maddr.maddr_handle == NULL) { - return; - } + ASSERT(RW_WRITE_HELD(&port->maccl_rwlock)); + if (port->p_mch == NULL) + return (0); /* - * If we are in layer 2 (i.e. switched) or would like to be - * in layer 2 then check if any ports or the vswitch itself - * need to be programmed into the HW. - * - * This can happen in two cases - switched was specified as - * the prefered mode of operation but we exhausted the HW - * resources and so failed over to the next specifed mode, - * or switched was the only mode specified so after HW - * resources were exhausted there was nothing more we - * could do. + * If the port has a specific 'pvid', then + * register with that vlan-id, otherwise register + * with VLAN_ID_NONE. */ - if (vswp->smode_idx > 0) - s_idx = vswp->smode_idx - 1; - else - s_idx = vswp->smode_idx; - - if (vswp->smode[s_idx] != VSW_LAYER2) { - return; + if (port->pvid != vswp->default_vlan_id) { + vid = port->pvid; } + macaddr = (uint8_t *)port->p_macaddr.ether_addr_octet; - D2(vswp, "%s: attempting reconfig..", __func__); - - /* - * First, attempt to set the vswitch mac address into HW, - * if required. 
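[Editor's note] vsw_set_port_hw_addr() registers the unicast address at the port's pvid only when that differs from the default VLAN (otherwise at VLAN_ID_NONE), and requests a hardware classification slot (MAC_UNICAST_HW) unless the switch is configured for promiscuous layer-2 mode. That selection logic, isolated; the constant values here are placeholders for the model, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	VLAN_ID_NONE	0	/* placeholder value for the model */
#define	MAC_UNICAST_HW	0x1	/* placeholder bit */

static uint16_t
pick_vid(uint16_t pvid, uint16_t default_vlan_id)
{
	/* register at the pvid only if it isn't the default vlan */
	return (pvid != default_vlan_id ? pvid : VLAN_ID_NONE);
}

static uint16_t
pick_flags(bool layer2_promisc)
{
	/* promisc mode: no h/w address slot; classification stays in s/w */
	return (layer2_promisc ? 0 : MAC_UNICAST_HW);
}

int
main(void)
{
	printf("vid=%u flags=%#x\n", pick_vid(5, 1),
	    (unsigned)pick_flags(false));
	printf("vid=%u flags=%#x\n", pick_vid(1, 1),
	    (unsigned)pick_flags(true));
	return (0);
}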
- if (vsw_prog_if(vswp)) { - return; + if (!(vswp->smode & VSW_LAYER2_PROMISC)) { + mac_flags |= MAC_UNICAST_HW; } - /* - * Next, attempt to set any ports which have not yet been - * programmed into HW. - */ - if (vsw_prog_ports(vswp)) { - return; + if (port->addr_set == B_FALSE) { + port->p_muh = NULL; + rv = mac_unicast_add(port->p_mch, macaddr, mac_flags, + &port->p_muh, vid, &diag); + + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Failed to program " + "macaddr,vid(%s, %d) err=%d", + vswp->instance, ether_sprintf((void *)macaddr), + vid, rv); + return (rv); + } + port->addr_set = B_TRUE; + + D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s", + __func__, ether_sprintf((void *)macaddr), vid, + vswp->physname); } - /* - * By now we know that have programmed all desired ports etc - * into HW, so safe to mark reconfiguration as complete. - */ - vswp->recfg_reqd = B_FALSE; + /* Add vlans to the MAC layer */ + vsw_mac_add_vlans(vswp, port->p_mch, macaddr, + mac_flags, port->vids, port->nvids); - vswp->smode_idx = s_idx; + mac_rx_set(port->p_mch, vsw_port_rx_cb, (void *)port); D1(vswp, "%s: exit", __func__); + return (rv); } /* - * Check to see if vsw itself is plumbed, and if so whether or not - * its mac address should be written into HW. + * Program the macaddress and vlans of the vsw interface. * - * Returns 0 if could set address, or didn't have to set it. - * Returns 1 if failed to set address. + * Returns 0 on success, 1 on failure. */ static int -vsw_prog_if(vsw_t *vswp) +vsw_set_if_hw_addr(vsw_t *vswp) { - mac_multi_addr_t addr; + uint16_t mac_flags = 0; + mac_diag_t diag; + uint8_t *macaddr; + uint8_t primary_addr[ETHERADDRL]; + uint16_t vid = VLAN_ID_NONE; + int rv; D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + if (vswp->mch == NULL) + return (0); - READ_ENTER(&vswp->if_lockrw); - if ((vswp->if_state & VSW_IF_UP) && - (vswp->addr_set != VSW_ADDR_HW)) { + macaddr = (uint8_t *)vswp->if_addr.ether_addr_octet; + + /* check if it is the primary macaddr of the card. */ + mac_unicast_primary_get(vswp->mh, primary_addr); + if (ether_cmp((void *)primary_addr, (void*)macaddr) == 0) { + mac_flags |= MAC_UNICAST_PRIMARY; + } + + /* + * If the interface has a specific 'pvid', then + * register with that vlan-id, otherwise register + * with VLAN_ID_NONE. + */ + if (vswp->pvid != vswp->default_vlan_id) { + vid = vswp->pvid; + } - addr.mma_addrlen = ETHERADDRL; - ether_copy(&vswp->if_addr, &addr.mma_addr); + if (!(vswp->smode & VSW_LAYER2_PROMISC)) { + mac_flags |= MAC_UNICAST_HW; + } - if (vsw_set_hw_addr(vswp, &addr) != 0) { - RW_EXIT(&vswp->if_lockrw); - return (1); + if (vswp->addr_set == B_FALSE) { + vswp->muh = NULL; + rv = mac_unicast_add(vswp->mch, macaddr, mac_flags, + &vswp->muh, vid, &diag); + + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Failed to program " + "macaddr,vid(%s, %d) err=%d", + vswp->instance, ether_sprintf((void *)macaddr), + vid, rv); + return (rv); } + vswp->addr_set = B_TRUE; - vswp->addr_slot = addr.mma_slot; + D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s", + __func__, ether_sprintf((void *)macaddr), vid, + vswp->physname); + } - /* - * If previously when plumbed had had to place - * interface into promisc mode, now reverse that. - * - * Note that interface will only actually be set into - * non-promisc mode when last port/interface has been - * programmed into HW.
- */
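vsw_set_if_hw_addr() adds one wrinkle over the port case: if the interface address happens to be the NIC's factory address, it must be registered with MAC_UNICAST_PRIMARY. A sketch of just the flag computation, assuming the calls behave as they are used above:

	static uint16_t
	example_unicast_flags(mac_handle_t mh, uint8_t *macaddr,
	    boolean_t promisc)
	{
		uint8_t		primary[ETHERADDRL];
		uint16_t	flags = 0;

		/* fetch the factory (primary) address of the device */
		mac_unicast_primary_get(mh, primary);
		if (ether_cmp((void *)primary, (void *)macaddr) == 0)
			flags |= MAC_UNICAST_PRIMARY;

		if (!promisc)
			flags |= MAC_UNICAST_HW;

		return (flags);
	}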
- if (vswp->addr_set == VSW_ADDR_PROMISC) - (void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV); + vsw_mac_add_vlans(vswp, vswp->mch, macaddr, mac_flags, + vswp->vids, vswp->nvids); - vswp->addr_set = VSW_ADDR_HW; - } - RW_EXIT(&vswp->if_lockrw); + mac_rx_set(vswp->mch, vsw_if_rx_cb, (void *)vswp); D1(vswp, "%s: exit", __func__); - return (0); + return (rv); } /* - * Scan the port list for any ports which have not yet been set - * into HW. For those found attempt to program their mac addresses - * into the physical device. + * Remove a unicast mac address which has previously been programmed + * into HW. * - * Returns 0 if able to program all required ports (can be 0) into HW. - * Returns 1 if failed to set at least one mac address. */ -static int -vsw_prog_ports(vsw_t *vswp) +static void +vsw_unset_hw_addr(vsw_t *vswp, vsw_port_t *port, int type) { - mac_multi_addr_t addr; - vsw_port_list_t *plist = &vswp->plist; - vsw_port_t *tp; - int rv = 0; + vsw_vlanid_t *vids; + int nvids; + mac_client_handle_t mch = NULL; D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); + ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - READ_ENTER(&plist->lockrw); - for (tp = plist->head; tp != NULL; tp = tp->p_next) { - if (tp->addr_set != VSW_ADDR_HW) { - addr.mma_addrlen = ETHERADDRL; - ether_copy(&tp->p_macaddr, &addr.mma_addr); - - if (vsw_set_hw_addr(vswp, &addr) != 0) { - rv = 1; - break; - } - - tp->addr_slot = addr.mma_slot; - - /* - * If when this port had first attached we had - * had to place the interface into promisc mode, - * then now reverse that. - * - * Note that the interface will not actually - * change to non-promisc mode until all ports - * have been programmed. - */ - if (tp->addr_set == VSW_ADDR_PROMISC) - (void) vsw_unset_hw_promisc(vswp, - tp, VSW_VNETPORT); - - tp->addr_set = VSW_ADDR_HW; - } + if (type == VSW_VNETPORT) { + ASSERT(port != NULL); + ASSERT(RW_WRITE_HELD(&port->maccl_rwlock)); + vids = port->vids; + nvids = port->nvids; + } else { + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + vids = vswp->vids; + nvids = vswp->nvids; } - RW_EXIT(&plist->lockrw); - D1(vswp, "%s: exit", __func__); - return (rv); -} + /* First clear the callback */ + if (type == VSW_LOCALDEV) { + mch = vswp->mch; + } else if (type == VSW_VNETPORT) { + mch = port->p_mch; + } -static void -vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp) -{ - ringp->ring_state = VSW_MAC_RING_FREE; - ringp->ring_arg = NULL; - ringp->ring_blank = NULL; - ringp->ring_vqp = NULL; - ringp->ring_vswp = vswp; -} -static void -vsw_mac_ring_tbl_init(vsw_t *vswp) -{ - int i; + if (mch == NULL) { + return; + } - mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL); + mac_rx_clear(mch); - vswp->mac_ring_tbl_sz = vsw_mac_rx_rings; - vswp->mac_ring_tbl = - kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP); + /* Remove vlans */ + vsw_mac_remove_vlans(mch, vids, nvids); - for (i = 0; i < vswp->mac_ring_tbl_sz; i++) - vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]); -} + if ((type == VSW_LOCALDEV) && (vswp->addr_set == B_TRUE)) { + (void) mac_unicast_remove(vswp->mch, vswp->muh); + vswp->muh = NULL; + D2(vswp, "removed vsw interface mac-addr from " + "the device %s", vswp->physname); + vswp->addr_set = B_FALSE; -static void -vsw_mac_ring_tbl_destroy(vsw_t *vswp) -{ - int i; - vsw_mac_ring_t *ringp; - - mutex_enter(&vswp->mac_ring_lock); - for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { - ringp = &vswp->mac_ring_tbl[i]; - -
if (ringp->ring_state != VSW_MAC_RING_FREE) { - /* - * Destroy the queue. - */ - vsw_queue_stop(ringp->ring_vqp); - vsw_queue_destroy(ringp->ring_vqp); - - /* - * Re-initialize the structure. - */ - vsw_mac_ring_tbl_entry_init(vswp, ringp); - } + } else if ((type == VSW_VNETPORT) && (port->addr_set == B_TRUE)) { + (void) mac_unicast_remove(port->p_mch, port->p_muh); + port->p_muh = NULL; + D2(vswp, "removed port(0x%p) mac-addr from " + "the device %s", port, vswp->physname); + port->addr_set = B_FALSE; } - mutex_exit(&vswp->mac_ring_lock); - mutex_destroy(&vswp->mac_ring_lock); - kmem_free(vswp->mac_ring_tbl, - vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t)); - vswp->mac_ring_tbl_sz = 0; + D1(vswp, "%s: exit", __func__); } /* - * Handle resource add callbacks from the driver below. + * receive callback routine for vsw interface. Invoked by MAC layer when there + * are pkts being passed up from physical device for this vsw interface. */ -static mac_resource_handle_t -vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp) +/* ARGSUSED */ +static void +vsw_if_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { + _NOTE(ARGUNUSED(mrh)) + vsw_t *vswp = (vsw_t *)arg; - mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; - vsw_mac_ring_t *ringp; - vsw_queue_t *vqp; - int i; + mblk_t *mpt; + int count; ASSERT(vswp != NULL); - ASSERT(mrp != NULL); - ASSERT(vswp->mac_ring_tbl != NULL); D1(vswp, "%s: enter", __func__); - /* - * Check to make sure we have the correct resource type. - */ - if (mrp->mr_type != MAC_RX_FIFO) - return (NULL); - - /* - * Find a open entry in the ring table. - */ - mutex_enter(&vswp->mac_ring_lock); - for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { - ringp = &vswp->mac_ring_tbl[i]; - - /* - * Check for an empty slot, if found, then setup queue - * and thread. - */ - if (ringp->ring_state == VSW_MAC_RING_FREE) { - /* - * Create the queue for this ring. - */ - vqp = vsw_queue_create(); - - /* - * Initialize the ring data structure. - */ - ringp->ring_vqp = vqp; - ringp->ring_arg = mrfp->mrf_arg; - ringp->ring_blank = mrfp->mrf_blank; - ringp->ring_state = VSW_MAC_RING_INUSE; - - /* - * Create the worker thread. - */ - vqp->vq_worker = thread_create(NULL, 0, - vsw_queue_worker, ringp, 0, &p0, - TS_RUN, minclsyspri); - if (vqp->vq_worker == NULL) { - vsw_queue_destroy(vqp); - vsw_mac_ring_tbl_entry_init(vswp, ringp); - ringp = NULL; - } - - if (ringp != NULL) { - /* - * Make sure thread get's running state for - * this ring. - */ - mutex_enter(&vqp->vq_lock); - while ((vqp->vq_state != VSW_QUEUE_RUNNING) && - (vqp->vq_state != VSW_QUEUE_DRAINED)) { - cv_wait(&vqp->vq_cv, &vqp->vq_lock); - } - - /* - * If the thread is not running, cleanup. - */ - if (vqp->vq_state == VSW_QUEUE_DRAINED) { - vsw_queue_destroy(vqp); - vsw_mac_ring_tbl_entry_init(vswp, - ringp); - ringp = NULL; - } - mutex_exit(&vqp->vq_lock); - } - - mutex_exit(&vswp->mac_ring_lock); - D1(vswp, "%s: exit", __func__); - return ((mac_resource_handle_t)ringp); + READ_ENTER(&vswp->if_lockrw); + if (vswp->if_state & VSW_IF_UP) { + RW_EXIT(&vswp->if_lockrw); + count = vsw_vlan_frame_untag(vswp, VSW_LOCALDEV, &mp, &mpt); + if (count != 0) { + mac_rx(vswp->if_mh, NULL, mp); } + } else { + RW_EXIT(&vswp->if_lockrw); + freemsgchain(mp); } - mutex_exit(&vswp->mac_ring_lock); - /* - * No slots in the ring table available. - */ D1(vswp, "%s: exit", __func__); - return (NULL); } +/* + * receive callback routine for port. 
Invoked by MAC layer when there + * are pkts being passed up from physical device for this port. + */ +/* ARGSUSED */ static void -vsw_queue_stop(vsw_queue_t *vqp) +vsw_port_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { - mutex_enter(&vqp->vq_lock); + _NOTE(ARGUNUSED(mrh)) - if (vqp->vq_state == VSW_QUEUE_RUNNING) { - vqp->vq_state = VSW_QUEUE_STOP; - cv_signal(&vqp->vq_cv); + vsw_t *vswp; + vsw_port_t *port = arg; - while (vqp->vq_state != VSW_QUEUE_DRAINED) - cv_wait(&vqp->vq_cv, &vqp->vq_lock); - } + ASSERT(port != NULL); + + vswp = port->p_vswp; - vqp->vq_state = VSW_QUEUE_STOPPED; + D1(vswp, "vsw_port_rx_cb: enter"); - mutex_exit(&vqp->vq_lock); + /* + * Send the packets to the peer directly. + */ + (void) vsw_portsend(port, mp); + + D1(vswp, "vsw_port_rx_cb: exit"); } -static vsw_queue_t * -vsw_queue_create() +/* + * Send a message out over the physical device + * via the MAC layer. + * + * Returns any mblks that it was unable to transmit. + */ +mblk_t * +vsw_tx_msg(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port) { - vsw_queue_t *vqp; + mac_client_handle_t mch; + mac_unicast_handle_t muh; - vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP); + READ_MACCL_ENTER(vswp, port, caller); - mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL); - vqp->vq_first = NULL; - vqp->vq_last = NULL; - vqp->vq_state = VSW_QUEUE_STOPPED; + mch = (caller == VSW_LOCALDEV) ? vswp->mch : port->p_mch; + muh = (caller == VSW_LOCALDEV) ? vswp->muh : port->p_muh; - return (vqp); -} + if ((mch != NULL) && (muh != NULL)) { + /* packets are sent or dropped */ + (void) mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL); + } -static void -vsw_queue_destroy(vsw_queue_t *vqp) -{ - cv_destroy(&vqp->vq_cv); - mutex_destroy(&vqp->vq_lock); - kmem_free(vqp, sizeof (vsw_queue_t)); + RW_MACCL_EXIT(vswp, port, caller); + return (NULL); } -static void -vsw_queue_worker(vsw_mac_ring_t *rrp) +/* + * vsw_port_mac_reconfig -- Cleanup and close the MAC client + * and reopen and re-configure the MAC client with new flags etc. + * This function is useful for two different purposes: + * 1) To update the MAC client with new vlan-ids. This is done + * by freeing the existing vlan-ids and reopening with the new + * vlan-ids. + * + * 2) If the Hybrid mode status of a port changes, then the + * MAC client needs to be closed and re-opened, otherwise, + * Share related resources may not be freed (hybrid mode disabled) + * or assigned (hybrid mode enabled). To accomplish this, + * this function simply closes and reopens the MAC client. + * The reopen will result in using the flags based on the + * new hybrid mode of the port. + */ +void +vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids) { - mblk_t *mp; - vsw_queue_t *vqp = rrp->ring_vqp; - vsw_t *vswp = rrp->ring_vswp; - - mutex_enter(&vqp->vq_lock); - - ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED); + vsw_t *vswp = portp->p_vswp; + int rv; + D1(vswp, "%s: enter", __func__); /* - * Set the state to running, since the thread is now active. + * Remove the multi-cast addresses, unicast address + * and close the mac-client. */ - vqp->vq_state = VSW_QUEUE_RUNNING; - cv_signal(&vqp->vq_cv); - - while (vqp->vq_state == VSW_QUEUE_RUNNING) { - /* - * Wait for work to do or the state has changed - * to not running.
- */
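The new vsw_tx_msg() relies on one property of mac_tx(): with MAC_DROP_ON_NO_DESC the MAC layer either transmits the chain or frees it itself, so the old return-and-requeue path disappears. A minimal sketch of that contract (the hint of 0 and NULL return pointer mirror the call above):

	static void
	example_tx(mac_client_handle_t mch, mblk_t *mp)
	{
		/*
		 * Packets are sent or dropped; nothing comes back to
		 * requeue, so the caller can always return NULL.
		 */
		(void) mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
	}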
- while ((vqp->vq_state == VSW_QUEUE_RUNNING) && - (vqp->vq_first == NULL)) { - cv_wait(&vqp->vq_cv, &vqp->vq_lock); - } - - /* - * Process packets that we received from the interface. - */ - if (vqp->vq_first != NULL) { - mp = vqp->vq_first; - - vqp->vq_first = NULL; - vqp->vq_last = NULL; - - mutex_exit(&vqp->vq_lock); - - /* switch the chain of packets received */ - vswp->vsw_switch_frame(vswp, mp, - VSW_PHYSDEV, NULL, NULL); - - mutex_enter(&vqp->vq_lock); + mutex_enter(&vswp->mac_lock); + WRITE_ENTER(&portp->maccl_rwlock); + vsw_mac_multicast_remove_all(vswp, portp, VSW_VNETPORT); + vsw_unset_hw(vswp, portp, VSW_VNETPORT); + vsw_maccl_close(vswp, portp, VSW_VNETPORT); + + if (update_vlans == B_TRUE) { + if (portp->nvids != 0) { + kmem_free(portp->vids, + sizeof (vsw_vlanid_t) * portp->nvids); + portp->vids = NULL; + portp->nvids = 0; } + portp->vids = new_vids; + portp->nvids = new_nvids; + portp->pvid = new_pvid; } /* - * We are drained and signal we are done. + * Now re-open the mac-client and + * configure unicast addr and multicast addrs. */ - vqp->vq_state = VSW_QUEUE_DRAINED; - cv_signal(&vqp->vq_cv); + rv = vsw_maccl_open(vswp, portp, VSW_VNETPORT); + if (rv != 0) { + goto recret; + } - /* - * Exit lock and drain the remaining packets. - */ - mutex_exit(&vqp->vq_lock); + if (vsw_set_hw(vswp, portp, VSW_VNETPORT)) { + cmn_err(CE_NOTE, "!vsw%d: port:%d failed to " + "set unicast address\n", vswp->instance, portp->p_instance); + goto recret; + } - /* - * Exit the thread - */ - thread_exit(); + vsw_mac_multicast_add_all(vswp, portp, VSW_VNETPORT); + +recret: + RW_EXIT(&portp->maccl_rwlock); + mutex_exit(&vswp->mac_lock); + D1(vswp, "%s: exit", __func__); } /* - * static void - * vsw_rx_queue_cb() - Receive callback routine when - * vsw_multi_ring_enable is non-zero. Queue the packets - * to a packet queue for a worker thread to process. + * vsw_if_mac_reconfig -- Reconfigure the vsw interface's mac-client + * by closing and re-opening it. This function is used to handle the + * following two cases: + * + * 1) Handle the MAC address change for the interface. + * 2) Handle vlan update. */ -static void -vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +void +vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids) { - vsw_mac_ring_t *ringp = (vsw_mac_ring_t *)mrh; - vsw_t *vswp = (vsw_t *)arg; - vsw_queue_t *vqp; - mblk_t *bp, *last; - - ASSERT(mrh != NULL); - ASSERT(vswp != NULL); - ASSERT(mp != NULL); + int rv; D1(vswp, "%s: enter", __func__); - /* - * Find the last element in the mblk chain. + * Remove the multi-cast addresses, unicast address + * and close the mac-client. */ - bp = mp; - do { - last = bp; - bp = bp->b_next; - } while (bp != NULL); - - /* Get the queue for the packets */ - vqp = ringp->ring_vqp; - - /* - * Grab the lock such we can queue the packets.
- */
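Both reconfiguration routines follow the same teardown/bring-up bracket around the closed MAC client. A condensed sketch of the ordering, built from the vsw helpers shown in this diff (locking elided):

	static void
	example_reconfig(vsw_t *vswp)
	{
		/* teardown: multicast addrs, unicast addr, then the client */
		vsw_mac_multicast_remove_all(vswp, NULL, VSW_LOCALDEV);
		vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
		vsw_maccl_close(vswp, NULL, VSW_LOCALDEV);

		/* ... swap in a new pvid/vid list here if required ... */

		/* bring-up: reopen, reprogram, re-add multicast addrs */
		if (vsw_maccl_open(vswp, NULL, VSW_LOCALDEV) != 0)
			return;
		if (vsw_set_hw(vswp, NULL, VSW_LOCALDEV) != 0)
			return;
		vsw_mac_multicast_add_all(vswp, NULL, VSW_LOCALDEV);
	}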
- mutex_enter(&vqp->vq_lock); - - if (vqp->vq_state != VSW_QUEUE_RUNNING) { - freemsgchain(mp); - mutex_exit(&vqp->vq_lock); - goto vsw_rx_queue_cb_exit; + mutex_enter(&vswp->mac_lock); + WRITE_ENTER(&vswp->maccl_rwlock); + vsw_mac_multicast_remove_all(vswp, NULL, VSW_LOCALDEV); + vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); + vsw_maccl_close(vswp, NULL, VSW_LOCALDEV); + + if (update_vlans == B_TRUE) { + if (vswp->nvids != 0) { + kmem_free(vswp->vids, + sizeof (vsw_vlanid_t) * vswp->nvids); + vswp->vids = NULL; + vswp->nvids = 0; + } + vswp->vids = new_vids; + vswp->nvids = new_nvids; + vswp->pvid = new_pvid; } /* - * Add the mblk chain to the queue. If there - * is some mblks in the queue, then add the new - * chain to the end. + * Now re-open the mac-client and + * configure unicast addr and multicast addrs. */ - if (vqp->vq_first == NULL) - vqp->vq_first = mp; - else - vqp->vq_last->b_next = mp; - - vqp->vq_last = last; + rv = vsw_maccl_open(vswp, NULL, VSW_LOCALDEV); + if (rv != 0) { + goto ifrecret; + } - /* - * Signal the worker thread that there is work to - * do. - */ - cv_signal(&vqp->vq_cv); + if (vsw_set_hw(vswp, NULL, VSW_LOCALDEV)) { + cmn_err(CE_NOTE, "!vsw%d:failed to set unicast address\n", + vswp->instance); + goto ifrecret; + } - /* - * Let go of the lock and exit. - */ - mutex_exit(&vqp->vq_lock); + vsw_mac_multicast_add_all(vswp, NULL, VSW_LOCALDEV); -vsw_rx_queue_cb_exit: +ifrecret: + RW_EXIT(&vswp->maccl_rwlock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s: exit", __func__); } /* - * receive callback routine. Invoked by MAC layer when there - * are pkts being passed up from physical device. + * vsw_mac_port_reconfig_vlans -- Reconfigure a port to handle + * vlan configuration update. As the removal of the last unicast-address,vid + * from the MAC client results in releasing all resources, it expects + * no Shares to be associated with such a MAC client. * - * PERF: It may be more efficient when the card is in promisc - * mode to check the dest address of the pkts here (against - * the FDB) rather than checking later. Needs to be investigated. + * To handle a vlan configuration update for a port that already has + * a Share bound, we need to free that Share prior to reconfiguration. + * Initiate the HybridIO setup again after the completion of reconfiguration. */ -static void -vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +void +vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid, + vsw_vlanid_t *new_vids, int new_nvids) { - _NOTE(ARGUNUSED(mrh)) - - vsw_t *vswp = (vsw_t *)arg; + /* + * As the reconfiguration involves the close of + * mac client, cleanup HybridIO and later restart + * HybridIO setup again. + */ + if (portp->p_hio_enabled == B_TRUE) { + vsw_hio_stop_port(portp); + } + vsw_port_mac_reconfig(portp, B_TRUE, new_pvid, new_vids, new_nvids); + if (portp->p_hio_enabled == B_TRUE) { + /* reset to setup the HybridIO again. */ + vsw_hio_port_reset(portp, B_FALSE); + } +}
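The update_vlans path in both routines swaps the vid array wholesale while the client is closed. A sketch of that step alone; it assumes the caller hands over a freshly allocated vsw_vlanid_t array whose vl_set fields are B_FALSE, so the reopen programs every vid:

	static void
	example_update_vids(vsw_vlanid_t **vidsp, int *nvidsp,
	    vsw_vlanid_t *new_vids, int new_nvids)
	{
		/* unicast handles were already removed at close time */
		if (*nvidsp != 0)
			kmem_free(*vidsp, sizeof (vsw_vlanid_t) * *nvidsp);

		/* adopt the new array; vsw_mac_add_vlans() fills vl_muh */
		*vidsp = new_vids;
		*nvidsp = new_nvids;
	}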
- ASSERT(vswp != NULL); +/* Add vlans to MAC client */ +static void +vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch, uint8_t *macaddr, + uint16_t flags, vsw_vlanid_t *vids, int nvids) +{ + vsw_vlanid_t *vidp; + mac_diag_t diag; + int rv; + int i; - D1(vswp, "vsw_rx_cb: enter"); + /* Add vlans to the MAC layer */ + for (i = 0; i < nvids; i++) { + vidp = &vids[i]; - /* switch the chain of packets received */ - vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); + if (vidp->vl_set == B_TRUE) { + continue; + } - D1(vswp, "vsw_rx_cb: exit"); + rv = mac_unicast_add(mch, macaddr, flags, + &vidp->vl_muh, vidp->vl_vid, &diag); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Failed to program " + "macaddr,vid(%s, %d) err=%d", + vswp->instance, ether_sprintf((void *)macaddr), + vidp->vl_vid, rv); + } else { + vidp->vl_set = B_TRUE; + D2(vswp, "%s:programmed macaddr(%s) vid(%d) " + "into device %s", __func__, + ether_sprintf((void *)macaddr), + vidp->vl_vid, vswp->physname); + } + } } -/* - * Send a message out over the physical device via the MAC layer. - * - * Returns any mblks that it was unable to transmit. - */ -mblk_t * -vsw_tx_msg(vsw_t *vswp, mblk_t *mp) +/* Remove vlans from the MAC client */ +static void +vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids, int nvids) { - const mac_txinfo_t *mtp; + int i; + vsw_vlanid_t *vidp; - READ_ENTER(&vswp->mac_rwlock); - if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) { - - DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); - RW_EXIT(&vswp->mac_rwlock); - return (mp); - } else { - mtp = vswp->txinfo; - mp = mtp->mt_fn(mtp->mt_arg, mp); + for (i = 0; i < nvids; i++) { + vidp = &vids[i]; + if (vidp->vl_set == B_FALSE) { + continue; + } + mac_unicast_remove(mch, vidp->vl_muh); + vidp->vl_set = B_FALSE; } - RW_EXIT(&vswp->mac_rwlock); - - return (mp); } #define ARH_FIXED_LEN 8 /* Length of fixed part of ARP header(see arp.h) */ @@ -1386,7 +1159,7 @@ vsw_tx_msg(vsw_t *vswp, mblk_t *mp) * vsw_publish_macaddr_count to zero in /etc/system.
 */ void -vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr) +vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp) { mblk_t *mp; mblk_t *bp; @@ -1404,7 +1177,7 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr) /* Initialize eth header */ ehp = (struct ether_header *)mp->b_rptr; bcopy(&etherbroadcastaddr, &ehp->ether_dhost, ETHERADDRL); - bcopy(addr, &ehp->ether_shost, ETHERADDRL); + bcopy(&portp->p_macaddr, &ehp->ether_shost, ETHERADDRL); ehp->ether_type = htons(ETHERTYPE_REVARP); /* Initialize arp packet */ @@ -1420,13 +1193,13 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr) cp += ARH_FIXED_LEN; /* Sender's hardware address and protocol address */ - bcopy(addr, cp, ETHERADDRL); + bcopy(&portp->p_macaddr, cp, ETHERADDRL); cp += ETHERADDRL; bzero(cp, plen); /* INADDR_ANY */ cp += plen; /* Target hardware address and protocol address */ - bcopy(addr, cp, ETHERADDRL); + bcopy(&portp->p_macaddr, cp, ETHERADDRL); cp += ETHERADDRL; bzero(cp, plen); /* INADDR_ANY */ cp += plen; @@ -1441,7 +1214,7 @@ } /* transmit the packet */ - bp = vsw_tx_msg(vswp, bp); + bp = vsw_tx_msg(vswp, bp, VSW_VNETPORT, portp); if (bp != NULL) { freemsg(bp); } @@ -1453,50 +1226,18 @@ static void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu) { - mac_prop_t mp; - uint32_t val; - int rv; - uint_t perm_flags = MAC_PROP_PERM_RW; - mp.mp_id = MAC_PROP_MTU; - mp.mp_name = mac_mtu_propname; - mp.mp_flags = 0; - - /* Get the mtu of the physical device */ - rv = mac_get_prop(vswp->mh, &mp, (void *)&val, sizeof (uint32_t), - &perm_flags); - if (rv != 0) { - cmn_err(CE_NOTE, - "!vsw%d: Unable to get the mtu of the physical device:%s\n", - vswp->instance, vswp->physname); - return; - } - - /* Return if the mtu is read-only */ - if (perm_flags != MAC_PROP_PERM_RW) { - cmn_err(CE_NOTE, - "!vsw%d: Read-only mtu of the physical device:%s\n", - vswp->instance, vswp->physname); - return; - } - - /* save the original mtu of physdev to reset it back later if needed */ - vswp->mtu_physdev_orig = val; - - if (val == mtu) { - /* no need to set, as the device already has the right mtu */ - return; - } - - mp.mp_id = MAC_PROP_MTU; - mp.mp_name = mac_mtu_propname; - mp.mp_flags = 0; + uint_t mtu_orig; + int rv; - /* Set the mtu in the physical device */ - rv = mac_set_prop(vswp->mh, &mp, &mtu, sizeof (uint32_t)); + rv = mac_set_mtu(vswp->mh, mtu, &mtu_orig); if (rv != 0) { cmn_err(CE_NOTE, "!vsw%d: Unable to set the mtu:%d, in the " "physical device:%s\n", vswp->instance, mtu, vswp->physname); + return; } + + /* save the original mtu of physdev to reset it back later if needed */ + vswp->mtu_physdev_orig = mtu_orig; } diff --git a/usr/src/uts/sun4v/io/vsw_switching.c b/usr/src/uts/sun4v/io/vsw_switching.c index 8c4ad6d4d0..5033f0665c 100644 --- a/usr/src/uts/sun4v/io/vsw_switching.c +++ b/usr/src/uts/sun4v/io/vsw_switching.c @@ -58,7 +58,6 @@ #include <sys/taskq.h> #include <sys/note.h> #include <sys/mach_descrip.h> -#include <sys/mac.h> #include <sys/mdeg.h> #include <sys/ldc.h> #include <sys/vsw_fdb.h> @@ -82,6 +81,8 @@ static int vsw_setup_layer2(vsw_t *); static int vsw_setup_layer3(vsw_t *); /* Switching/data transmit routines */ +static void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port, mac_resource_handle_t); static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port, mac_resource_handle_t); static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, @@ -117,26 +118,26 @@
void vsw_del_mcst_vsw(vsw_t *); /* Support functions */ static mblk_t *vsw_dupmsgchain(mblk_t *mp); -static uint32_t vsw_get_same_dest_list(struct ether_header *ehp, - mblk_t **rhead, mblk_t **rtail, mblk_t **mpp); +static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp); /* * Functions imported from other files. */ -extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); +extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *); extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t); extern int vsw_mac_open(vsw_t *vswp); extern void vsw_mac_close(vsw_t *vswp); extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh, mblk_t *mp, vsw_macrx_flags_t flags); extern void vsw_set_addrs(vsw_t *vswp); -extern int vsw_get_hw_maddr(vsw_t *); -extern int vsw_mac_attach(vsw_t *vswp); -extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, - uint32_t count); +extern int vsw_portsend(vsw_port_t *port, mblk_t *mp); extern void vsw_hio_init(vsw_t *vswp); extern void vsw_hio_start_ports(vsw_t *vswp); +extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, + mcst_addr_t *mcst_p, int type); +extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, + mcst_addr_t *mcst_p, int type); /* * Tunables used in this file. @@ -226,9 +227,9 @@ vsw_stop_switching_timeout(vsw_t *vswp) (void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE); - WRITE_ENTER(&vswp->mac_rwlock); + mutex_enter(&vswp->mac_lock); vswp->mac_open_retries = 0; - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); } /* @@ -246,39 +247,24 @@ vsw_stop_switching_timeout(vsw_t *vswp) int vsw_setup_switching(vsw_t *vswp) { - int i, rv = 1; + int rv = 1; D1(vswp, "%s: enter", __func__); /* * Select best switching mode. - * Note that we start from the saved smode_idx. This is done as - * this routine can be called from the timeout handler to retry - * setting up a specific mode. Currently only the function which - * sets up layer2/promisc mode returns EAGAIN if the underlying - * physical device is not available yet, causing retries. + * This is done as this routine can be called from the timeout + * handler to retry setting up a specific mode. Currently only + * the function which sets up layer2/promisc mode returns EAGAIN + * if the underlying network device is not available yet, causing + * retries. */ - for (i = vswp->smode_idx; i < vswp->smode_num; i++) { - vswp->smode_idx = i; - switch (vswp->smode[i]) { - case VSW_LAYER2: - case VSW_LAYER2_PROMISC: - rv = vsw_setup_layer2(vswp); - break; - - case VSW_LAYER3: - rv = vsw_setup_layer3(vswp); - break; - - default: - DERR(vswp, "unknown switch mode"); - break; - } - - if ((rv == 0) || (rv == EAGAIN)) - break; - - /* all other errors(rv != 0): continue & select the next mode */ + if (vswp->smode & VSW_LAYER2) { + rv = vsw_setup_layer2(vswp); + } else if (vswp->smode & VSW_LAYER3) { + rv = vsw_setup_layer3(vswp); + } else { + DERR(vswp, "unknown switch mode"); rv = 1; } @@ -290,7 +276,7 @@ vsw_setup_switching(vsw_t *vswp) } D2(vswp, "%s: Operating in mode %d", __func__, - vswp->smode[vswp->smode_idx]); + vswp->smode); D1(vswp, "%s: exit", __func__); @@ -312,7 +298,12 @@ vsw_setup_layer2(vsw_t *vswp) D1(vswp, "%s: enter", __func__); + /* + * Until the network device is successfully opened, + * set the switching to use vsw_switch_l2_frame. 
+ */ vswp->vsw_switch_frame = vsw_switch_l2_frame; + vswp->mac_cl_switching = B_FALSE; rv = strlen(vswp->physname); if (rv == 0) { @@ -320,61 +311,42 @@ * Physical device name is NULL, which is * required for layer 2. */ - cmn_err(CE_WARN, "!vsw%d: no physical device name specified", + cmn_err(CE_WARN, "!vsw%d: no network device name specified", vswp->instance); return (EIO); } - WRITE_ENTER(&vswp->mac_rwlock); + mutex_enter(&vswp->mac_lock); rv = vsw_mac_open(vswp); if (rv != 0) { if (rv != EAGAIN) { - cmn_err(CE_WARN, "!vsw%d: Unable to open physical " + cmn_err(CE_WARN, "!vsw%d: Unable to open network " "device: %s\n", vswp->instance, vswp->physname); } - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (rv); } - if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { - /* - * Verify that underlying device can support multiple - * unicast mac addresses. - */ - rv = vsw_get_hw_maddr(vswp); - if (rv != 0) { - goto exit_error; - } - } - /* - * Attempt to link into the MAC layer so we can get - * and send packets out over the physical adapter. + * Now we can use the mac client switching, so set the switching + * function to use vsw_switch_l2_frame_mac_client(), which simply + * sends the packets to MAC layer for switching. */ - rv = vsw_mac_attach(vswp); - if (rv != 0) { - /* - * Registration with the MAC layer has failed, - * so return error so that can fall back to next - * prefered switching method. - */ - cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: " - "%s\n", vswp->instance, vswp->physname); - goto exit_error; - } + vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client; + vswp->mac_cl_switching = B_TRUE; D1(vswp, "%s: exit", __func__); - RW_EXIT(&vswp->mac_rwlock); - /* Initialize HybridIO related stuff */ vsw_hio_init(vswp); + + mutex_exit(&vswp->mac_lock); return (0); exit_error: vsw_mac_close(vswp); - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (EIO); } @@ -400,6 +372,31 @@ vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port, } /* + * Use the mac client for layer 2 switching. + */ +static void +vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port, mac_resource_handle_t mrh) +{ + _NOTE(ARGUNUSED(mrh)) + + mblk_t *ret_m; + + /* + * This switching function is expected to be called by + * the ports or the interface only. Packets from the + * physical interface are already switched. + */ + ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV)); + + if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) { + DERR(vswp, "%s: drop mblks to " + "phys dev", __func__); + freemsgchain(ret_m); + } +} + +/* * Switch the given ethernet frame when operating in layer 2 mode.
* * vswp: pointer to the vsw instance @@ -419,8 +416,6 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, { struct ether_header *ehp; mblk_t *bp, *ret_m; - mblk_t *mpt = NULL; - uint32_t count; vsw_fdbe_t *fp; D1(vswp, "%s: enter (caller %d)", __func__, caller); @@ -435,8 +430,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, bp = mp; while (bp) { ehp = (struct ether_header *)bp->b_rptr; - count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); - ASSERT(count != 0); + mp = vsw_get_same_dest_list(ehp, &bp); + ASSERT(mp != NULL); D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", __func__, MBLKSIZE(mp), MBLKL(mp)); @@ -476,7 +471,7 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, * vsw_port (connected to a vnet device - * VSW_VNETPORT) */ - (void) vsw_portsend(fp->portp, mp, mpt, count); + (void) vsw_portsend(fp->portp, mp); /* Release the reference on the fdb entry */ VSW_FDBE_REFRELE(fp); @@ -517,8 +512,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); - if ((ret_m = vsw_tx_msg(vswp, mp)) - != NULL) { + if ((ret_m = vsw_tx_msg(vswp, mp, + caller, arg)) != NULL) { DERR(vswp, "%s: drop mblks to " "phys dev", __func__); freemsgchain(ret_m); @@ -539,8 +534,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, * Pkt came down the stack, send out * over physical device. */ - if ((ret_m = vsw_tx_msg(vswp, mp)) - != NULL) { + if ((ret_m = vsw_tx_msg(vswp, mp, + caller, NULL)) != NULL) { DERR(vswp, "%s: drop mblks to " "phys dev", __func__); freemsgchain(ret_m); @@ -566,8 +561,6 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, { struct ether_header *ehp; mblk_t *bp = NULL; - mblk_t *mpt; - uint32_t count; vsw_fdbe_t *fp; D1(vswp, "%s: enter (caller %d)", __func__, caller); @@ -587,8 +580,8 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, bp = mp; while (bp) { ehp = (struct ether_header *)bp->b_rptr; - count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); - ASSERT(count != 0); + mp = vsw_get_same_dest_list(ehp, &bp); + ASSERT(mp != NULL); D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", __func__, MBLKSIZE(mp), MBLKL(mp)); @@ -601,7 +594,7 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, if (fp != NULL) { D2(vswp, "%s: sending to target port", __func__); - (void) vsw_portsend(fp->portp, mp, mpt, count); + (void) vsw_portsend(fp->portp, mp); /* Release the reference on the fdb entry */ VSW_FDBE_REFRELE(fp); @@ -644,8 +637,7 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, void vsw_setup_layer2_post_process(vsw_t *vswp) { - if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || - (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) { + if (vswp->smode & VSW_LAYER2) { /* * Program unicst, mcst addrs of vsw * interface and ports in the physdev. @@ -676,13 +668,13 @@ vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) * Broadcast message from inside ldoms so send to outside * world if in either of layer 2 modes. 
*/ - if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || - (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && + if ((vswp->smode & VSW_LAYER2) && ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { nmp = vsw_dupmsgchain(mp); if (nmp) { - if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { + if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg)) + != NULL) { DERR(vswp, "%s: dropping pkt(s) " "consisting of %ld bytes of data for" " physical device", __func__, MBLKL(ret_m)); @@ -716,20 +708,12 @@ vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) } else { nmp = vsw_dupmsgchain(mp); if (nmp) { - mblk_t *mpt = nmp; - uint32_t count = 1; - - /* Find tail */ - while (mpt->b_next != NULL) { - mpt = mpt->b_next; - count++; - } /* * The plist->lockrw is protecting the * portp from getting destroyed here. * So, no ref_cnt is incremented here. */ - (void) vsw_portsend(portp, nmp, mpt, count); + (void) vsw_portsend(portp, nmp); } else { DERR(vswp, "vsw_forward_all: nmp NULL"); } @@ -772,12 +756,12 @@ vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) * over the physical adapter, and then check to see if any other * vnets are interested in it. */ - if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || - (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && + if ((vswp->smode & VSW_LAYER2) && ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { nmp = vsw_dupmsgchain(mp); if (nmp) { - if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { + if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg)) + != NULL) { DERR(vswp, "%s: dropping pkt(s) consisting of " "%ld bytes of data for physical device", __func__, MBLKL(ret_m)); @@ -819,21 +803,12 @@ vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) nmp = vsw_dupmsgchain(mp); if (nmp) { - mblk_t *mpt = nmp; - uint32_t count = 1; - - /* Find tail */ - while (mpt->b_next != NULL) { - mpt = mpt->b_next; - count++; - } /* * The vswp->mfdbrw is protecting the * portp from getting destroyed here. * So, no ref_cnt is incremented here. 
 */ - (void) vsw_portsend(port, nmp, mpt, - count); + (void) vsw_portsend(port, nmp); } } else { vsw_mac_rx(vswp, NULL, @@ -970,32 +945,46 @@ vsw_vlan_add_ids rv = mod_hash_insert(vswp->vlan_hashp, (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for " + "the interface", vswp->instance, vswp->pvid); + } for (i = 0; i < vswp->nvids; i++) { rv = mod_hash_insert(vswp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)" + " for the interface", vswp->instance, + vswp->vids[i].vl_vid); + } } } else if (type == VSW_VNETPORT) { vsw_port_t *portp = (vsw_port_t *)arg; + vsw_t *vswp = portp->p_vswp; rv = mod_hash_insert(portp->vlan_hashp, (mod_hash_key_t)VLAN_ID_KEY(portp->pvid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for " + "the port(%d)", vswp->instance, portp->pvid, + portp->p_instance); + } for (i = 0; i < portp->nvids; i++) { rv = mod_hash_insert(portp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)" + " for the port(%d)", vswp->instance, + portp->vids[i].vl_vid, portp->p_instance); + } } - } else { - return; } } @@ -1021,10 +1010,12 @@ } for (i = 0; i < vswp->nvids; i++) { - rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]); + rv = vsw_vlan_lookup(vswp->vlan_hashp, + vswp->vids[i].vl_vid); if (rv == B_TRUE) { rv = mod_hash_remove(vswp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY( + vswp->vids[i].vl_vid), (mod_hash_val_t *)&vp); ASSERT(rv == 0); } @@ -1043,10 +1034,12 @@ } for (i = 0; i < portp->nvids; i++) { - rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]); + rv = vsw_vlan_lookup(portp->vlan_hashp, + portp->vids[i].vl_vid); if (rv == B_TRUE) { rv = mod_hash_remove(portp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY( + portp->vids[i].vl_vid), (mod_hash_val_t *)&vp); ASSERT(rv == 0); } @@ -1097,7 +1090,11 @@ vsw_fdbe_add(vsw_t *vswp, void *port) */ rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr, (mod_hash_val_t)fp); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for " + "the port(%d)", vswp->instance, + ether_sprintf(&portp->p_macaddr), portp->p_instance); + } } /* @@ -1264,7 +1261,7 @@ vsw_vlan_frame_pretag * Returns: * np: head of updated chain of packets * npt: tail of updated chain of packets - * rv: count of any packets dropped + * rv: count of the packets in the returned list */ uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt) @@ -1285,6 +1282,7 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt) ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); + if (type == VSW_LOCALDEV) { vswp = (vsw_t *)arg; pvid = vswp->pvid; @@ -1298,6 +1296,27 @@ pvid = portp->pvid; } + /* + * If MAC layer switching is in place, then + * untagging is required only if the pvid is not + * the same as default_vlan_id. This is because + * the MAC layer will send packets for the + * registered vlans only. + */ + if ((vswp->mac_cl_switching == B_TRUE) && + (pvid == vswp->default_vlan_id)) { + /* simply count and set the tail */ + count = 1; + bp = *np; + ASSERT(bp != NULL); + while (bp->b_next != NULL) { + bp = bp->b_next; + count++; + } + *npt = bp; + return (count); + } + bpn = bph = bpt = NULL; count = 0; @@ -1313,45 +1332,67 @@ is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id); /* - * Check if the destination is in the same vlan. + * If MAC layer switching is in place, then we + * need to untag only if the tagged packet has + * a vlan-id same as the pvid. */ - rv = vsw_vlan_lookup(vlan_hashp, vlan_id); - if (rv == B_FALSE) { - /* drop the packet */ - freemsg(bp); - count++; - continue; - } + if (vswp->mac_cl_switching == B_TRUE) { - /* - * Check the frame header if tag/untag is needed. - */ - if (is_tagged == B_FALSE) { - /* - * Untagged frame. We shouldn't have an untagged - * packet at this point, unless the destination's - * vlan id is default-vlan-id; if it is not the - * default-vlan-id, we drop the packet. - */ - if (vlan_id != vswp->default_vlan_id) { - /* drop the packet */ - freemsg(bp); - count++; - continue; - } - } else { - /* - * Tagged frame, untag if it's the destination's pvid. - */ + /* only tagged packets expected here */ + ASSERT(is_tagged == B_TRUE); if (vlan_id == pvid) { - bp = vnet_vlan_remove_tag(bp); if (bp == NULL) { /* packet dropped */ - count++; continue; } } + } else { /* No MAC layer switching */ + + /* + * Check the frame header if tag/untag is needed. + */ + if (is_tagged == B_FALSE) { + /* + * Untagged frame. We shouldn't have an + * untagged packet at this point, unless + * the destination's vlan id is + * default-vlan-id; if it is not the + * default-vlan-id, we drop the packet. + */ + if (vlan_id != vswp->default_vlan_id) { + /* drop the packet */ + freemsg(bp); + continue; + } + } else { /* Tagged */ + /* + * Tagged frame, untag if it's the + * destination's pvid. + */ + if (vlan_id == pvid) { + + bp = vnet_vlan_remove_tag(bp); + if (bp == NULL) { + /* packet dropped */ + continue; + } + } else { + + /* + * Check if the destination is in the
+ */ + rv = vsw_vlan_lookup(vlan_hashp, + vlan_id); + if (rv == B_FALSE) { + /* drop the packet */ + freemsg(bp); + continue; + } + } + + } } /* build a chain of processed packets */ @@ -1361,12 +1402,11 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt) bpt->b_next = bp; bpt = bp; } - + count++; } *np = bph; *npt = bpt; - return (count); } @@ -1476,26 +1516,13 @@ vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) * just increments a ref counter (which is * used when the address is being deleted) */ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL) { - if (mac_multicst_add(vswp->mh, - (uchar_t *)&mcst_pkt->mca[i])) { - RW_EXIT(&vswp->mac_rwlock); - cmn_err(CE_WARN, "!vsw%d: " - "unable to add multicast " - "address: %s\n", - vswp->instance, - ether_sprintf((void *) - &mcst_p->mca)); - (void) vsw_del_mcst(vswp, - VSW_VNETPORT, addr, port); - kmem_free(mcst_p, - sizeof (*mcst_p)); - return (1); - } - mcst_p->mac_added = B_TRUE; + if (vsw_mac_multicast_add(vswp, port, mcst_p, + VSW_VNETPORT)) { + (void) vsw_del_mcst(vswp, + VSW_VNETPORT, addr, port); + kmem_free(mcst_p, sizeof (*mcst_p)); + return (1); } - RW_EXIT(&vswp->mac_rwlock); mutex_enter(&port->mca_lock); mcst_p->nextp = port->mcap; @@ -1530,24 +1557,8 @@ vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) * if other ports are interested in this * address. */ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL && mcst_p->mac_added) { - if (mac_multicst_remove(vswp->mh, - (uchar_t *)&mcst_pkt->mca[i])) { - RW_EXIT(&vswp->mac_rwlock); - cmn_err(CE_WARN, "!vsw%d: " - "unable to remove mcast " - "address: %s\n", - vswp->instance, - ether_sprintf((void *) - &mcst_p->mca)); - kmem_free(mcst_p, - sizeof (*mcst_p)); - return (1); - } - mcst_p->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); + vsw_mac_multicast_remove(vswp, port, mcst_p, + VSW_VNETPORT); kmem_free(mcst_p, sizeof (*mcst_p)); } else { @@ -1780,13 +1791,7 @@ vsw_del_mcst_port(vsw_port_t *port) * if other ports are interested in this * address. */ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL && mcap->mac_added) { - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcap->mca); - } - RW_EXIT(&vswp->mac_rwlock); - + vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT); kmem_free(mcap, sizeof (*mcap)); mutex_enter(&port->mca_lock); @@ -1829,11 +1834,9 @@ vsw_del_mcst_vsw(vsw_t *vswp) D1(vswp, "%s: exit", __func__); } -static uint32_t -vsw_get_same_dest_list(struct ether_header *ehp, - mblk_t **rhead, mblk_t **rtail, mblk_t **mpp) +mblk_t * +vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp) { - uint32_t count = 0; mblk_t *bp; mblk_t *nbp; mblk_t *head = NULL; @@ -1860,16 +1863,12 @@ vsw_get_same_dest_list(struct ether_header *ehp, tail->b_next = bp; tail = bp; } - count++; } else { prev = bp; } bp = nbp; } - *rhead = head; - *rtail = tail; - DTRACE_PROBE1(vsw_same_dest, int, count); - return (count); + return (head); } static mblk_t * diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c index 694930fe28..df698ebe69 100644 --- a/usr/src/uts/sun4v/os/mach_startup.c +++ b/usr/src/uts/sun4v/os/mach_startup.c @@ -308,18 +308,18 @@ mach_hw_copy_limit(void) } /* - * We need to enable soft ring functionality on Niagara platform since - * one strand can't handle interrupts for a 1Gb NIC. Set the tunable - * ip_squeue_soft_ring by default on this platform. We can also set - * ip_threads_per_cpu to track number of threads per core. 
The variables - * themselves are defined in space.c and used by IP module + * We need to enable soft ring functionality on Niagara platforms since + * one strand can't handle interrupts for a 1Gb NIC. So set the tunable + * mac_soft_ring_enable by default on this platform. + * mac_soft_ring_enable variable is defined in space.c and used by MAC + * module. This tunable in concert with mac_soft_ring_count (declared + * in mac.h) will configure the number of fanout soft rings for a link. */ -extern uint_t ip_threads_per_cpu; -extern boolean_t ip_squeue_soft_ring; +extern boolean_t mac_soft_ring_enable; void startup_platform(void) { - ip_squeue_soft_ring = B_TRUE; + mac_soft_ring_enable = B_TRUE; if (clock_tick_threshold == 0) clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD; if (clock_tick_ncpus == 0) diff --git a/usr/src/uts/sun4v/sys/vnet_res.h b/usr/src/uts/sun4v/sys/vnet_res.h index 035ad1328c..b5cd4472fb 100644 --- a/usr/src/uts/sun4v/sys/vnet_res.h +++ b/usr/src/uts/sun4v/sys/vnet_res.h @@ -27,12 +27,12 @@ #ifndef _VNET_RES_H #define _VNET_RES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif +#include <sys/mac_provider.h> + /* * Vio network resource types. * VIO_NET_RES_LDC_SERVICE: diff --git a/usr/src/uts/sun4v/sys/vsw.h b/usr/src/uts/sun4v/sys/vsw.h index 069e26d60a..456480f909 100644 --- a/usr/src/uts/sun4v/sys/vsw.h +++ b/usr/src/uts/sun4v/sys/vsw.h @@ -40,6 +40,7 @@ extern "C" { #include <sys/vio_mailbox.h> #include <sys/vnet_common.h> #include <sys/ethernet.h> +#include <sys/mac_client.h> #include <sys/vio_util.h> #include <sys/vgen_stats.h> #include <sys/vsw_ldc.h> @@ -59,57 +60,6 @@ extern "C" { #define VSW_LOCALDEV 4 /* vsw configured as an eth interface */ /* - * Vsw queue -- largely modeled after squeue - * - * VSW_QUEUE_RUNNING, vqueue thread for queue is running. - * VSW_QUEUE_DRAINED, vqueue thread has drained current work and is exiting. - * VSW_QUEUE_STOP, request for the vqueue thread to stop. - * VSW_QUEUE_STOPPED, vqueue thread is not running. - */ -#define VSW_QUEUE_RUNNING 0x01 -#define VSW_QUEUE_DRAINED 0x02 -#define VSW_QUEUE_STOP 0x04 -#define VSW_QUEUE_STOPPED 0x08 - -typedef struct vsw_queue_s { - kmutex_t vq_lock; /* Lock, before using any member. */ - kcondvar_t vq_cv; /* Async threads block on. */ - uint32_t vq_state; /* State flags. */ - - mblk_t *vq_first; /* First mblk chain or NULL. */ - mblk_t *vq_last; /* Last mblk chain. */ - - processorid_t vq_bind; /* Process to bind to */ - kthread_t *vq_worker; /* Queue's thread */ -} vsw_queue_t; - -/* - * VSW MAC Ring Resources. - * MAC Ring resource is composed of this state structure and - * a kernel thread to perform the processing of the ring. - */ -typedef struct vsw_mac_ring_s { - uint32_t ring_state; - - mac_blank_t ring_blank; - void *ring_arg; - - vsw_queue_t *ring_vqp; - struct vsw *ring_vswp; -} vsw_mac_ring_t; - -/* - * Maximum Ring Resources. - */ -#define VSW_MAC_RX_RINGS 0x40 - -/* - * States for entry in ring table. - */ -#define VSW_MAC_RING_FREE 1 -#define VSW_MAC_RING_INUSE 2 - -/* * Number of hash chains in the multicast forwarding database. */ #define VSW_NCHAINS 8 @@ -139,6 +89,15 @@ typedef struct vsw_mac_ring_s { #define VSW_PRI_ETH_DEFINED(vswp) ((vswp)->pri_num_types != 0) /* + * vlan-id information. + */ +typedef struct vsw_vlanid { + uint16_t vl_vid; /* vlan-id */ + mac_unicast_handle_t vl_muh; /* mac unicast handle */ + boolean_t vl_set; /* set? */ +} vsw_vlanid_t; + +/* * vsw instance state information. 
*/ typedef struct vsw { @@ -147,9 +106,7 @@ typedef struct vsw { uint64_t regprop; /* "reg" property */ struct vsw *next; /* next in list */ char physname[LIFNAMSIZ]; /* phys-dev */ - uint8_t smode[NUM_SMODES]; /* switching mode */ - int smode_idx; /* curr pos in smode array */ - int smode_num; /* # of modes specified */ + uint8_t smode; /* switching mode */ kmutex_t swtmout_lock; /* setup switching tmout lock */ boolean_t swtmout_enabled; /* setup switching tmout on */ timeout_id_t swtmout_id; /* setup switching tmout id */ @@ -174,24 +131,16 @@ typedef struct vsw { vsw_port_t *, mac_resource_handle_t); /* mac layer */ - krwlock_t mac_rwlock; /* protect fields below */ + kmutex_t mac_lock; /* protect mh */ mac_handle_t mh; - mac_rx_handle_t mrh; - multiaddress_capab_t maddr; /* Multiple uni addr capable */ - const mac_txinfo_t *txinfo; /* MAC tx routine */ - boolean_t mstarted; /* Mac Started? */ - boolean_t mresources; /* Mac Resources cb? */ - - /* - * MAC Ring Resources. - */ - kmutex_t mac_ring_lock; /* Lock for the table. */ - uint32_t mac_ring_tbl_sz; - vsw_mac_ring_t *mac_ring_tbl; /* Mac ring table. */ - - kmutex_t hw_lock; /* sync access to HW */ + krwlock_t maccl_rwlock; /* protect fields below */ + mac_client_handle_t mch; /* mac client handle */ + mac_unicast_handle_t muh; /* mac unicast handle */ + boolean_t recfg_reqd; /* Reconfig of addrs needed */ - int promisc_cnt; + + /* mac layer switching flag */ + boolean_t mac_cl_switching; /* Machine Description updates */ mdeg_node_spec_t *inst_spec; @@ -204,8 +153,7 @@ typedef struct vsw { krwlock_t if_lockrw; uint8_t if_state; /* interface state */ - mac_addr_slot_t addr_slot; /* Unicast address slot */ - int addr_set; /* Addr set where */ + boolean_t addr_set; /* is addr set to HW */ /* multicast addresses when configured as eth interface */ kmutex_t mca_lock; /* multicast lock */ @@ -216,7 +164,7 @@ typedef struct vsw { vio_mblk_pool_t *pri_tx_vmp; /* tx priority mblk pool */ uint16_t default_vlan_id; /* default vlan id */ uint16_t pvid; /* port vlan id (untagged) */ - uint16_t *vids; /* vlan ids (tagged) */ + vsw_vlanid_t *vids; /* vlan ids (tagged) */ uint16_t nvids; /* # of vids */ uint32_t vids_size; /* size alloc'd for vids list */ diff --git a/usr/src/uts/sun4v/sys/vsw_hio.h b/usr/src/uts/sun4v/sys/vsw_hio.h index 70b79ea04e..1521d6cff9 100644 --- a/usr/src/uts/sun4v/sys/vsw_hio.h +++ b/usr/src/uts/sun4v/sys/vsw_hio.h @@ -55,10 +55,6 @@ typedef struct vsw_share { uint64_t vs_macaddr; /* Associated MAC addr */ uint64_t vs_cookie; /* Share Cookie from alloc_share */ - /* physdev's share related info */ - mac_share_handle_t vs_shdl; /* HIO share handle */ - mac_group_info_t vs_rxginfo; /* RX group info */ - uint64_t vs_gnum; /* RX group number */ } vsw_share_t; #define VSW_SHARE_FREE 0x0 @@ -68,11 +64,8 @@ typedef struct vsw_share { /* Hybrid related info */ typedef struct vsw_hio { - mac_capab_rings_t vh_rcapab; /* Rings capability data */ - mac_capab_share_t vh_scapab; /* Share capability data */ - vsw_share_t *vh_shares; /* Array of Shares */ uint32_t vh_num_shares; /* Number of shares available */ - + vsw_share_t *vh_shares; /* Array of Shares */ uint32_t vh_kstat_size; /* size for the whole kstats */ vsw_hio_kstats_t *vh_kstatsp; /* stats for vsw hio */ kstat_t *vh_ksp; /* kstats */ diff --git a/usr/src/uts/sun4v/sys/vsw_ldc.h b/usr/src/uts/sun4v/sys/vsw_ldc.h index 31344465f5..46d04fac10 100644 --- a/usr/src/uts/sun4v/sys/vsw_ldc.h +++ b/usr/src/uts/sun4v/sys/vsw_ldc.h @@ -362,10 +362,6 @@ typedef struct mcst_addr { 
#define VSW_PORT_DETACHING 0x2 /* In process of being detached */ #define VSW_PORT_DETACHABLE 0x4 /* Safe to detach */ -#define VSW_ADDR_UNSET 0x0 /* Addr not set */ -#define VSW_ADDR_HW 0x1 /* Addr programmed in HW */ -#define VSW_ADDR_PROMISC 0x2 /* Card in promisc to see addr */ - /* port information associated with a vsw */ typedef struct vsw_port { int p_instance; /* port instance */ @@ -382,20 +378,22 @@ typedef struct vsw_port { kmutex_t state_lock; kcondvar_t state_cv; + krwlock_t maccl_rwlock; /* protect fields below */ + mac_client_handle_t p_mch; /* mac client handle */ + mac_unicast_handle_t p_muh; /* mac unicast handle */ + kmutex_t mca_lock; /* multicast lock */ mcst_addr_t *mcap; /* list of multicast addrs */ - mac_addr_slot_t addr_slot; /* Unicast address slot */ - int addr_set; /* Addr set where */ + boolean_t addr_set; /* is addr set to HW */ /* * mac address of the port & connected device */ struct ether_addr p_macaddr; uint16_t pvid; /* port vlan id (untagged) */ - uint16_t *vids; /* vlan ids (tagged) */ + struct vsw_vlanid *vids; /* vlan ids (tagged) */ uint16_t nvids; /* # of vids */ - uint32_t vids_size; /* size alloc'd for vids list */ mod_hash_t *vlan_hashp; /* vlan hash table */ uint32_t vlan_nchains; /* # of vlan hash chains */ @@ -444,7 +442,7 @@ static struct ether_addr etherbroadcastaddr = { }; #define IS_BROADCAST(ehp) \ - (ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0) + (bcmp(&ehp->ether_dhost, &etherbroadcastaddr, ETHERADDRL) == 0) #define IS_MULTICAST(ehp) \ ((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
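IS_BROADCAST() now compares the raw six bytes with bcmp() instead of going through ether_cmp(), which keeps the header independent of mac-layer helpers. A small usage sketch of the two classification macros:

	/* classify a frame for tracing; returns a static label */
	static const char *
	example_classify(struct ether_header *ehp)
	{
		if (IS_BROADCAST(ehp))
			return ("broadcast");
		if (IS_MULTICAST(ehp))
			return ("multicast");
		return ("unicast");
	}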